[clang] [compiler-rt] [flang] [libc] [libcxx] [lldb] [llvm] [RFC][Clang] Add __int256/__uint256 builtin types (PR #182733)

Xavier Roche via cfe-commits cfe-commits at lists.llvm.org
Mon Mar 2 02:54:25 PST 2026


https://github.com/xroche updated https://github.com/llvm/llvm-project/pull/182733

>From 4ad63284b520eb707abb6219854df22225ed7b52 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:37:59 +0100
Subject: [PATCH 01/17] [clang] Add __int256/__uint256 builtin type definitions

Define Int256 and UInt256 as new builtin types alongside __int128/__uint128.
Add type specifiers, token kinds, target info queries (hasInt256Type(),
getInt256Align()), and target-specific overrides (SPIR disables the type).
Plumb through BuiltinTypes.def, TypeBase.h, Specifiers.h, DeclID.h,
TypeLoc.h, TokenKinds.def, and TargetInfo.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 clang/include/clang/AST/BuiltinTypes.def |  6 ++++++
 clang/include/clang/AST/DeclID.h         |  6 ++++++
 clang/include/clang/AST/TypeBase.h       | 10 ++++------
 clang/include/clang/AST/TypeLoc.h        |  2 +-
 clang/include/clang/Basic/Specifiers.h   |  1 +
 clang/include/clang/Basic/TargetInfo.h   |  9 +++++++++
 clang/include/clang/Basic/TokenKinds.def |  1 +
 clang/lib/Basic/TargetInfo.cpp           |  1 +
 clang/lib/Basic/Targets/SPIR.h           |  4 ++++
 9 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/AST/BuiltinTypes.def b/clang/include/clang/AST/BuiltinTypes.def
index 444be4311a743..5af242566d84f 100644
--- a/clang/include/clang/AST/BuiltinTypes.def
+++ b/clang/include/clang/AST/BuiltinTypes.def
@@ -95,6 +95,9 @@ UNSIGNED_TYPE(ULongLong, UnsignedLongLongTy)
 // '__uint128_t'
 UNSIGNED_TYPE(UInt128, UnsignedInt128Ty)
 
+// '__uint256_t'
+UNSIGNED_TYPE(UInt256, UnsignedInt256Ty)
+
 //===- Signed Types -------------------------------------------------------===//
 
 // 'char' for targets where it's signed
@@ -121,6 +124,9 @@ SIGNED_TYPE(LongLong, LongLongTy)
 // '__int128_t'
 SIGNED_TYPE(Int128, Int128Ty)
 
+// '__int256_t'
+SIGNED_TYPE(Int256, Int256Ty)
+
 //===- Fixed point types --------------------------------------------------===//
 
 // 'short _Accum'
diff --git a/clang/include/clang/AST/DeclID.h b/clang/include/clang/AST/DeclID.h
index 47ae05b2747ae..801defab4dfe5 100644
--- a/clang/include/clang/AST/DeclID.h
+++ b/clang/include/clang/AST/DeclID.h
@@ -53,6 +53,12 @@ enum PredefinedDeclIDs {
   /// The unsigned 128-bit integer type.
   PREDEF_DECL_UNSIGNED_INT_128_ID,
 
+  /// The signed 256-bit integer type.
+  PREDEF_DECL_INT_256_ID,
+
+  /// The unsigned 256-bit integer type.
+  PREDEF_DECL_UNSIGNED_INT_256_ID,
+
   /// The internal 'instancetype' typedef.
   PREDEF_DECL_OBJC_INSTANCETYPE_ID,
 
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h
index 9402469f5e12b..dba08422ca8d2 100644
--- a/clang/include/clang/AST/TypeBase.h
+++ b/clang/include/clang/AST/TypeBase.h
@@ -1935,7 +1935,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
     unsigned : NumTypeBits;
 
     /// The kind (BuiltinType::Kind) of builtin type this is.
-    static constexpr unsigned NumOfBuiltinTypeBits = 9;
+    static constexpr unsigned NumOfBuiltinTypeBits = 10;
     unsigned Kind : NumOfBuiltinTypeBits;
   };
 
@@ -3230,16 +3230,14 @@ class BuiltinType : public Type {
   bool isSugared() const { return false; }
   QualType desugar() const { return QualType(this, 0); }
 
-  bool isInteger() const {
-    return getKind() >= Bool && getKind() <= Int128;
-  }
+  bool isInteger() const { return getKind() >= Bool && getKind() <= Int256; }
 
   bool isSignedInteger() const {
-    return getKind() >= Char_S && getKind() <= Int128;
+    return getKind() >= Char_S && getKind() <= Int256;
   }
 
   bool isUnsignedInteger() const {
-    return getKind() >= Bool && getKind() <= UInt128;
+    return getKind() >= Bool && getKind() <= UInt256;
   }
 
   bool isFloatingPoint() const {
diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h
index 24df18dbaace4..5c972c857d1dc 100644
--- a/clang/include/clang/AST/TypeLoc.h
+++ b/clang/include/clang/AST/TypeLoc.h
@@ -605,7 +605,7 @@ class BuiltinTypeLoc : public ConcreteTypeLoc<UnqualTypeLoc,
 
   bool needsExtraLocalData() const {
     BuiltinType::Kind bk = getTypePtr()->getKind();
-    return (bk >= BuiltinType::UShort && bk <= BuiltinType::UInt128) ||
+    return (bk >= BuiltinType::UShort && bk <= BuiltinType::UInt256) ||
            (bk >= BuiltinType::Short && bk <= BuiltinType::Ibm128) ||
            bk == BuiltinType::UChar || bk == BuiltinType::SChar;
   }
diff --git a/clang/include/clang/Basic/Specifiers.h b/clang/include/clang/Basic/Specifiers.h
index 118c3b75aed95..ca644b7d01392 100644
--- a/clang/include/clang/Basic/Specifiers.h
+++ b/clang/include/clang/Basic/Specifiers.h
@@ -62,6 +62,7 @@ namespace clang {
     TST_char32, // C++11 char32_t
     TST_int,
     TST_int128,
+    TST_int256,
     TST_bitint,  // Bit-precise integer types.
     TST_half,    // OpenCL half, ARM NEON __fp16
     TST_Float16, // C11 extension ISO/IEC TS 18661-3
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index ec6cd2be7c3c5..61b5d80f4f102 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -100,6 +100,7 @@ struct TransferrableTargetInfo {
   unsigned char LongWidth, LongAlign;
   unsigned char LongLongWidth, LongLongAlign;
   unsigned char Int128Align;
+  unsigned short Int256Align;
 
   // This is an optional parameter for targets that
   // don't use 'LongLongAlign' for '_BitInt' max alignment
@@ -543,6 +544,9 @@ class TargetInfo : public TransferrableTargetInfo,
   /// getInt128Align() - Returns the alignment of Int128.
   unsigned getInt128Align() const { return Int128Align; }
 
+  /// getInt256Align() - Returns the alignment of Int256.
+  unsigned getInt256Align() const { return Int256Align; }
+
   /// getBitIntMaxAlign() - Returns the maximum possible alignment of
   /// '_BitInt' and 'unsigned _BitInt'.
   unsigned getBitIntMaxAlign() const {
@@ -680,6 +684,11 @@ class TargetInfo : public TransferrableTargetInfo,
            getTargetOpts().ForceEnableInt128;
   } // FIXME
 
+  /// Determine whether the __int256 type is supported on this target.
+  virtual bool hasInt256Type() const {
+    return getPointerWidth(LangAS::Default) >= 64;
+  }
+
   /// Determine whether the _BitInt type is supported on this target. This
   /// limitation is put into place for ABI reasons.
   /// FIXME: _BitInt is a required type in C23, so there's not much utility in
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 8b9f613037718..0ebd1743f8ba0 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -488,6 +488,7 @@ KEYWORD(__float128                  , KEYALL)
 KEYWORD(__ibm128                    , KEYALL)
 KEYWORD(__imag                      , KEYALL)
 KEYWORD(__int128                    , KEYALL)
+KEYWORD(__int256                    , KEYALL)
 KEYWORD(__label__                   , KEYALL)
 KEYWORD(__real                      , KEYALL)
 KEYWORD(__thread                    , KEYALL)
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 794621c4b3e1f..c04bde68c937d 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -82,6 +82,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
   LongWidth = LongAlign = 32;
   LongLongWidth = LongLongAlign = 64;
   Int128Align = 128;
+  Int256Align = 128;
 
   // Fixed point default bit widths
   ShortAccumWidth = ShortAccumAlign = 16;
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index eef9521c7434a..a8d60e7714295 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -242,6 +242,8 @@ class LLVM_LIBRARY_VISIBILITY BaseSPIRTargetInfo : public TargetInfo {
   bool hasBitIntType() const override { return true; }
 
   bool hasInt128Type() const override { return false; }
+
+  bool hasInt256Type() const override { return false; }
 };
 
 class LLVM_LIBRARY_VISIBILITY SPIRTargetInfo : public BaseSPIRTargetInfo {
@@ -478,6 +480,8 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final
   }
 
   bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); }
+
+  bool hasInt256Type() const override { return TargetInfo::hasInt256Type(); }
 };
 
 class LLVM_LIBRARY_VISIBILITY SPIRV64IntelTargetInfo final

>From 6d9f71a193f178234c98f97be477220572146a1a Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:38:16 +0100
Subject: [PATCH 02/17] [clang] Add __int256/__uint256 AST, parser, sema, and
 codegen

Wire __int256/__uint256 through all clang subsystems:
- AST: context queries, constant evaluation, mangling (Itanium/MSVC),
  printf format strings, record layout, type printing, type locs
- Parser: declaration specifiers, expression parsing, tentative parsing
- Sema: type checking, overload resolution, template variadic support,
  atomic/bitfield constraints, tautological comparisons
- CodeGen: LLVM type mapping, debug info (DW_ATE_signed/unsigned_256),
  TBAA, Swift calling convention, X86 ABI, CIR
- Serialization: AST reader/writer with new type ID
- Frontend: preprocessor macros (__INT256_MAX__, etc.)
- Index: USR generation

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 clang/include/clang/AST/ASTContext.h          | 17 ++++++-
 clang/include/clang/Sema/DeclSpec.h           |  1 +
 .../include/clang/Serialization/ASTBitCodes.h |  8 +++-
 clang/lib/AST/ASTContext.cpp                  | 36 +++++++++++++++
 clang/lib/AST/ExprConstant.cpp                |  1 +
 clang/lib/AST/ItaniumMangle.cpp               | 18 ++++++++
 clang/lib/AST/MicrosoftMangle.cpp             |  6 +++
 clang/lib/AST/NSAPI.cpp                       |  2 +
 clang/lib/AST/PrintfFormatString.cpp          |  2 +
 clang/lib/AST/RecordLayoutBuilder.cpp         |  1 +
 clang/lib/AST/StmtPrinter.cpp                 |  4 ++
 clang/lib/AST/Type.cpp                        |  4 ++
 clang/lib/AST/TypeLoc.cpp                     |  2 +
 clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp |  2 +
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        |  2 +
 clang/lib/CIR/CodeGen/CIRGenTypeCache.h       |  2 +
 clang/lib/CIR/CodeGen/CIRGenTypes.cpp         |  2 +
 clang/lib/CodeGen/CGDebugInfo.cpp             |  2 +
 clang/lib/CodeGen/CodeGenModule.cpp           |  5 +++
 clang/lib/CodeGen/CodeGenTBAA.cpp             |  2 +
 clang/lib/CodeGen/CodeGenTypes.cpp            |  5 +++
 clang/lib/CodeGen/ItaniumCXXABI.cpp           | 44 ++++++++++++-------
 clang/lib/CodeGen/SwiftCallingConv.cpp        |  3 ++
 clang/lib/CodeGen/Targets/X86.cpp             | 29 ++++++++++--
 clang/lib/Frontend/InitPreprocessor.cpp       |  2 +
 clang/lib/Index/USRGeneration.cpp             |  6 +++
 clang/lib/Parse/ParseDecl.cpp                 |  7 +++
 clang/lib/Parse/ParseExpr.cpp                 |  1 +
 clang/lib/Parse/ParseExprCXX.cpp              |  3 ++
 clang/lib/Parse/ParseTentative.cpp            |  2 +
 clang/lib/Sema/DeclSpec.cpp                   |  8 +++-
 clang/lib/Sema/Sema.cpp                       | 14 ++++++
 clang/lib/Sema/SemaChecking.cpp               | 10 ++---
 clang/lib/Sema/SemaOverload.cpp               |  8 ++++
 clang/lib/Sema/SemaTemplateVariadic.cpp       |  1 +
 clang/lib/Sema/SemaType.cpp                   | 28 ++++++++----
 clang/lib/Sema/TreeTransform.h                |  8 ++--
 clang/lib/Serialization/ASTCommon.cpp         |  6 +++
 clang/lib/Serialization/ASTReader.cpp         | 18 ++++++++
 clang/lib/Serialization/ASTWriter.cpp         |  2 +
 40 files changed, 283 insertions(+), 41 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 05302c30d18d1..2a552066afb8f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -433,6 +433,12 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// The typedef for the __uint128_t type.
   mutable TypedefDecl *UInt128Decl = nullptr;
 
+  /// The typedef for the __int256_t type.
+  mutable TypedefDecl *Int256Decl = nullptr;
+
+  /// The typedef for the __uint256_t type.
+  mutable TypedefDecl *UInt256Decl = nullptr;
+
   /// The typedef for the target specific predefined
   /// __builtin_va_list type.
   mutable TypedefDecl *BuiltinVaListDecl = nullptr;
@@ -1296,9 +1302,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
   CanQualType Char8Ty;  // [C++20 proposal]
   CanQualType Char16Ty; // [C++0x 3.9.1p5], integer type in C99.
   CanQualType Char32Ty; // [C++0x 3.9.1p5], integer type in C99.
-  CanQualType SignedCharTy, ShortTy, IntTy, LongTy, LongLongTy, Int128Ty;
+  CanQualType SignedCharTy, ShortTy, IntTy, LongTy, LongLongTy, Int128Ty,
+      Int256Ty;
   CanQualType UnsignedCharTy, UnsignedShortTy, UnsignedIntTy, UnsignedLongTy;
-  CanQualType UnsignedLongLongTy, UnsignedInt128Ty;
+  CanQualType UnsignedLongLongTy, UnsignedInt128Ty, UnsignedInt256Ty;
   CanQualType FloatTy, DoubleTy, LongDoubleTy, Float128Ty, Ibm128Ty;
   CanQualType ShortAccumTy, AccumTy,
       LongAccumTy;  // ISO/IEC JTC1 SC22 WG14 N1169 Extension
@@ -1448,6 +1455,12 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// Retrieve the declaration for the 128-bit unsigned integer type.
   TypedefDecl *getUInt128Decl() const;
 
+  /// Retrieve the declaration for the 256-bit signed integer type.
+  TypedefDecl *getInt256Decl() const;
+
+  /// Retrieve the declaration for the 256-bit unsigned integer type.
+  TypedefDecl *getUInt256Decl() const;
+
   //===--------------------------------------------------------------------===//
   //                           Type Constructors
   //===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
index 6e5421c7072c7..40661f25fa65e 100644
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -255,6 +255,7 @@ class DeclSpec {
   static const TST TST_char32 = clang::TST_char32;
   static const TST TST_int = clang::TST_int;
   static const TST TST_int128 = clang::TST_int128;
+  static const TST TST_int256 = clang::TST_int256;
   static const TST TST_bitint = clang::TST_bitint;
   static const TST TST_half = clang::TST_half;
   static const TST TST_BFloat16 = clang::TST_BFloat16;
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index d72f1f9db86b2..9916b5cd9369b 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1126,6 +1126,12 @@ enum PredefinedTypeIDs {
   /// \brief The '__ibm128' type
   PREDEF_TYPE_IBM128_ID = 74,
 
+  /// The '__uint256_t' type.
+  PREDEF_TYPE_UINT256_ID = 75,
+
+  /// The '__int256_t' type.
+  PREDEF_TYPE_INT256_ID = 76,
+
 /// OpenCL image types with auto numeration
 #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix)                   \
   PREDEF_TYPE_##Id##_ID,
@@ -1163,7 +1169,7 @@ enum PredefinedTypeIDs {
 ///
 /// Type IDs for non-predefined types will start at
 /// NUM_PREDEF_TYPE_IDs.
-const unsigned NUM_PREDEF_TYPE_IDS = 514;
+const unsigned NUM_PREDEF_TYPE_IDS = 516;
 
 // Ensure we do not overrun the predefined types we reserved
 // in the enum PredefinedTypeIDs above.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 5fbdff280073f..a9f79015e8483 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1215,6 +1215,18 @@ TypedefDecl *ASTContext::getUInt128Decl() const {
   return UInt128Decl;
 }
 
+TypedefDecl *ASTContext::getInt256Decl() const {
+  if (!Int256Decl)
+    Int256Decl = buildImplicitTypedef(Int256Ty, "__int256_t");
+  return Int256Decl;
+}
+
+TypedefDecl *ASTContext::getUInt256Decl() const {
+  if (!UInt256Decl)
+    UInt256Decl = buildImplicitTypedef(UnsignedInt256Ty, "__uint256_t");
+  return UInt256Decl;
+}
+
 void ASTContext::InitBuiltinType(CanQualType &R, BuiltinType::Kind K) {
   auto *Ty = new (*this, alignof(BuiltinType)) BuiltinType(K);
   R = CanQualType::CreateUnsafe(QualType(Ty, 0));
@@ -1301,6 +1313,10 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
   InitBuiltinType(Int128Ty,            BuiltinType::Int128);
   InitBuiltinType(UnsignedInt128Ty,    BuiltinType::UInt128);
 
+  // Extension, 256-bit integers.
+  InitBuiltinType(Int256Ty, BuiltinType::Int256);
+  InitBuiltinType(UnsignedInt256Ty, BuiltinType::UInt256);
+
   // C++ 3.9.1p5
   if (TargetInfo::isTypeSigned(Target.getWCharType()))
     InitBuiltinType(WCharTy,           BuiltinType::WChar_S);
@@ -2174,6 +2190,11 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
       Width = 128;
       Align = Target->getInt128Align();
       break;
+    case BuiltinType::Int256:
+    case BuiltinType::UInt256:
+      Width = 256;
+      Align = Target->getInt256Align();
+      break;
     case BuiltinType::ShortAccum:
     case BuiltinType::UShortAccum:
     case BuiltinType::SatShortAccum:
@@ -8233,6 +8254,11 @@ unsigned ASTContext::getIntegerRank(const Type *T) const {
   case BuiltinType::UInt128:
     return 7 + (getIntWidth(Int128Ty) << 3);
 
+  case BuiltinType::Int256:
+  case BuiltinType::UInt256:
+    // Base rank > 7 is fine: only the total ordering matters, not the low bits.
+    return 8 + (getIntWidth(Int256Ty) << 3);
+
   // "The ranks of char8_t, char16_t, char32_t, and wchar_t equal the ranks of
   // their underlying types" [c++20 conv.rank]
   case BuiltinType::Char8:
@@ -9165,6 +9191,8 @@ static char getObjCEncodingForPrimitiveType(const ASTContext *C,
     case BuiltinType::ULong:
         return C->getTargetInfo().getLongWidth() == 32 ? 'L' : 'Q';
     case BuiltinType::UInt128:    return 'T';
+    case BuiltinType::UInt256:
+      return 'W';
     case BuiltinType::ULongLong:  return 'Q';
     case BuiltinType::Char_S:
     case BuiltinType::SChar:      return 'c';
@@ -9176,6 +9204,8 @@ static char getObjCEncodingForPrimitiveType(const ASTContext *C,
       return C->getTargetInfo().getLongWidth() == 32 ? 'l' : 'q';
     case BuiltinType::LongLong:   return 'q';
     case BuiltinType::Int128:     return 't';
+    case BuiltinType::Int256:
+      return 'w';
     case BuiltinType::Float:      return 'f';
     case BuiltinType::Double:     return 'd';
     case BuiltinType::LongDouble: return 'D';
@@ -12388,6 +12418,8 @@ QualType ASTContext::getCorrespondingUnsignedType(QualType T) const {
     return UnsignedLongLongTy;
   case BuiltinType::Int128:
     return UnsignedInt128Ty;
+  case BuiltinType::Int256:
+    return UnsignedInt256Ty;
   // wchar_t is special. It is either signed or not, but when it's signed,
   // there's no matching "unsigned wchar_t". Therefore we return the unsigned
   // version of its underlying type instead.
@@ -12462,6 +12494,8 @@ QualType ASTContext::getCorrespondingSignedType(QualType T) const {
     return LongLongTy;
   case BuiltinType::UInt128:
     return Int128Ty;
+  case BuiltinType::UInt256:
+    return Int256Ty;
   // wchar_t is special. It is either unsigned or not, but when it's unsigned,
   // there's no matching "signed wchar_t". Therefore we return the signed
   // version of its underlying type instead.
@@ -13466,6 +13500,8 @@ QualType ASTContext::getIntTypeForBitwidth(unsigned DestWidth,
   CanQualType QualTy = getFromTargetType(Ty);
   if (!QualTy && DestWidth == 128)
     return Signed ? Int128Ty : UnsignedInt128Ty;
+  if (!QualTy && DestWidth == 256)
+    return Signed ? Int256Ty : UnsignedInt256Ty;
   return QualTy;
 }
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index feea97cd67534..ea985428d0251 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -15381,6 +15381,7 @@ GCCTypeClass EvaluateBuiltinClassifyType(QualType T,
     case BuiltinType::ULong:
     case BuiltinType::ULongLong:
     case BuiltinType::UInt128:
+    case BuiltinType::UInt256:
       return GCCTypeClass::Integer;
 
     case BuiltinType::UShortAccum:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 1faf7f1466e39..3e7a5fcc8a492 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3177,6 +3177,12 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
         Out << "u4i128";
         addSubstitution(BuiltinType::Int128);
         break;
+      case 256:
+        if (mangleSubstitution(BuiltinType::Int256))
+          break;
+        Out << "u4i256";
+        addSubstitution(BuiltinType::Int256);
+        break;
       default:
         llvm_unreachable("Unknown integer size for normalization");
       }
@@ -3212,6 +3218,12 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
         Out << "u4u128";
         addSubstitution(BuiltinType::UInt128);
         break;
+      case 256:
+        if (mangleSubstitution(BuiltinType::UInt256))
+          break;
+        Out << "u4u256";
+        addSubstitution(BuiltinType::UInt256);
+        break;
       default:
         llvm_unreachable("Unknown integer size for normalization");
       }
@@ -3247,6 +3259,9 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
   case BuiltinType::UInt128:
     Out << 'o';
     break;
+  case BuiltinType::UInt256:
+    Out << "u8__uint256";
+    break;
   case BuiltinType::SChar:
     Out << 'a';
     break;
@@ -3278,6 +3293,9 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
   case BuiltinType::Int128:
     Out << 'n';
     break;
+  case BuiltinType::Int256:
+    Out << "u7__int256";
+    break;
   case BuiltinType::Float16:
     Out << "DF16_";
     break;
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 1f28d281be9fe..e499551da5c83 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -2723,6 +2723,12 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
   case BuiltinType::UInt128:
     Out << "_M";
     break;
+  case BuiltinType::Int256:
+    Out << "$$_L";
+    break;
+  case BuiltinType::UInt256:
+    Out << "$$_M";
+    break;
   case BuiltinType::Bool:
     Out << "_N";
     break;
diff --git a/clang/lib/AST/NSAPI.cpp b/clang/lib/AST/NSAPI.cpp
index 17f5ee5dee3d1..43d964e2edca6 100644
--- a/clang/lib/AST/NSAPI.cpp
+++ b/clang/lib/AST/NSAPI.cpp
@@ -399,6 +399,7 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const {
   case BuiltinType::Char16:
   case BuiltinType::Char32:
   case BuiltinType::Int128:
+  case BuiltinType::Int256:
   case BuiltinType::LongDouble:
   case BuiltinType::ShortAccum:
   case BuiltinType::Accum:
@@ -425,6 +426,7 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const {
   case BuiltinType::SatUFract:
   case BuiltinType::SatULongFract:
   case BuiltinType::UInt128:
+  case BuiltinType::UInt256:
   case BuiltinType::Float16:
   case BuiltinType::Float128:
   case BuiltinType::Ibm128:
diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp
index 855550475721a..bf821ad4e3049 100644
--- a/clang/lib/AST/PrintfFormatString.cpp
+++ b/clang/lib/AST/PrintfFormatString.cpp
@@ -820,6 +820,8 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt,
   case BuiltinType::Char32:
   case BuiltinType::UInt128:
   case BuiltinType::Int128:
+  case BuiltinType::UInt256:
+  case BuiltinType::Int256:
   case BuiltinType::Half:
   case BuiltinType::BFloat16:
   case BuiltinType::Float16:
diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp
index 0b1bf813efd10..6f7f09211bd0b 100644
--- a/clang/lib/AST/RecordLayoutBuilder.cpp
+++ b/clang/lib/AST/RecordLayoutBuilder.cpp
@@ -1470,6 +1470,7 @@ void ItaniumRecordLayoutBuilder::LayoutWideBitField(uint64_t FieldSize,
       Context.UnsignedCharTy,     Context.UnsignedShortTy,
       Context.UnsignedIntTy,      Context.UnsignedLongTy,
       Context.UnsignedLongLongTy, Context.UnsignedInt128Ty,
+      Context.UnsignedInt256Ty,
   };
 
   QualType Type;
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index f4ce4a7573aab..78a8e024d7f0d 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -1506,6 +1506,10 @@ void StmtPrinter::VisitIntegerLiteral(IntegerLiteral *Node) {
     break; // no suffix.
   case BuiltinType::UInt128:
     break; // no suffix.
+  case BuiltinType::Int256:
+    break; // no suffix.
+  case BuiltinType::UInt256:
+    break; // no suffix.
   case BuiltinType::WChar_S:
   case BuiltinType::WChar_U:
     break; // no suffix
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index a85f08753a132..dbe50a7f4f927 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3446,6 +3446,8 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
     return "long long";
   case Int128:
     return "__int128";
+  case Int256:
+    return "__int256";
   case UChar:
     return "unsigned char";
   case UShort:
@@ -3458,6 +3460,8 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
     return "unsigned long long";
   case UInt128:
     return "unsigned __int128";
+  case UInt256:
+    return "unsigned __int256";
   case Half:
     return Policy.Half ? "half" : "__fp16";
   case BFloat16:
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
index 53edfdb65a4d5..1766ca37c9a65 100644
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -344,12 +344,14 @@ TypeSpecifierType BuiltinTypeLoc::getWrittenTypeSpec() const {
   case BuiltinType::ULong:
   case BuiltinType::ULongLong:
   case BuiltinType::UInt128:
+  case BuiltinType::UInt256:
   case BuiltinType::SChar:
   case BuiltinType::Short:
   case BuiltinType::Int:
   case BuiltinType::Long:
   case BuiltinType::LongLong:
   case BuiltinType::Int128:
+  case BuiltinType::Int256:
   case BuiltinType::Half:
   case BuiltinType::Float:
   case BuiltinType::Double:
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index a18e2b91b1dd4..3e1ef878f981b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -669,6 +669,8 @@ static bool typeInfoIsInStandardLibrary(const BuiltinType *ty) {
   case BuiltinType::Char32:
   case BuiltinType::Int128:
   case BuiltinType::UInt128:
+  case BuiltinType::Int256:
+  case BuiltinType::UInt256:
     return true;
 
 #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix)                   \
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 223b53731359a..a0328e1523eb2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -82,6 +82,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
   sInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true);
   sInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true);
   sInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true);
+  sInt256Ty = cir::IntType::get(&getMLIRContext(), 256, /*isSigned=*/true);
   uInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false);
   uInt8PtrTy = cir::PointerType::get(uInt8Ty);
   cirAllocaAddressSpace = getTargetCIRGenInfo().getCIRAllocaAddressSpace();
@@ -89,6 +90,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
   uInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false);
   uInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false);
   uInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false);
+  uInt256Ty = cir::IntType::get(&getMLIRContext(), 256, /*isSigned=*/false);
   fP16Ty = cir::FP16Type::get(&getMLIRContext());
   bFloat16Ty = cir::BF16Type::get(&getMLIRContext());
   floatTy = cir::SingleType::get(&getMLIRContext());
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
index 4f3c319816e3a..cd4227501c94c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
@@ -35,6 +35,7 @@ struct CIRGenTypeCache {
   cir::IntType sInt32Ty;
   cir::IntType sInt64Ty;
   cir::IntType sInt128Ty;
+  cir::IntType sInt256Ty;
 
   // ClangIR unsigned integral type of common sizes
   cir::IntType uInt8Ty;
@@ -42,6 +43,7 @@ struct CIRGenTypeCache {
   cir::IntType uInt32Ty;
   cir::IntType uInt64Ty;
   cir::IntType uInt128Ty;
+  cir::IntType uInt256Ty;
 
   // ClangIR floating-point types with fixed formats
   cir::FP16Type fP16Ty;
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index d5641441b2384..8c96bcad11459 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -315,6 +315,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     case BuiltinType::Char_S:
     case BuiltinType::Int:
     case BuiltinType::Int128:
+    case BuiltinType::Int256:
     case BuiltinType::Long:
     case BuiltinType::LongLong:
     case BuiltinType::SChar:
@@ -387,6 +388,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     case BuiltinType::UChar:
     case BuiltinType::UInt:
     case BuiltinType::UInt128:
+    case BuiltinType::UInt256:
     case BuiltinType::ULong:
     case BuiltinType::ULongLong:
     case BuiltinType::UShort:
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 1758513a2844b..698cc97b49e8a 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -1122,6 +1122,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
   case BuiltinType::UShort:
   case BuiltinType::UInt:
   case BuiltinType::UInt128:
+  case BuiltinType::UInt256:
   case BuiltinType::ULong:
   case BuiltinType::WChar_U:
   case BuiltinType::ULongLong:
@@ -1130,6 +1131,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
   case BuiltinType::Short:
   case BuiltinType::Int:
   case BuiltinType::Int128:
+  case BuiltinType::Int256:
   case BuiltinType::Long:
   case BuiltinType::WChar_S:
   case BuiltinType::LongLong:
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c31bcabe49016..107ef39106c18 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -375,6 +375,11 @@ static void checkDataLayoutConsistency(const TargetInfo &Target,
       Triple.getArch() != llvm::Triple::ve)
     Check("__int128", llvm::Type::getIntNTy(Context, 128), Target.Int128Align);
 
+  if (Target.hasInt256Type() && !Triple.isAMDGPU() && !Triple.isSPIRV() &&
+      Triple.getArch() != llvm::Triple::ve &&
+      Triple.getArch() != llvm::Triple::systemz)
+    Check("__int256", llvm::Type::getIntNTy(Context, 256), Target.Int256Align);
+
   if (Target.hasFloat16Type())
     Check("half", llvm::Type::getFloatingPointTy(Context, *Target.HalfFormat),
           Target.HalfAlign);
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index cd08f3ec397a0..8a7743019c8e8 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -184,6 +184,8 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
       return getTypeInfo(Context.LongLongTy);
     case BuiltinType::UInt128:
       return getTypeInfo(Context.Int128Ty);
+    case BuiltinType::UInt256:
+      return getTypeInfo(Context.Int256Ty);
 
     case BuiltinType::UShortFract:
       return getTypeInfo(Context.ShortFractTy);
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 6bd79056e599a..5014cc9cea6b5 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -520,6 +520,11 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
       ResultType = llvm::IntegerType::get(getLLVMContext(), 128);
       break;
 
+    case BuiltinType::UInt256:
+    case BuiltinType::Int256:
+      ResultType = llvm::IntegerType::get(getLLVMContext(), 256);
+      break;
+
 #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
     case BuiltinType::Id:
 #include "clang/Basic/OpenCLImageTypes.def"
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 52768a8300a20..5a71293510796 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -3726,6 +3726,8 @@ static bool TypeInfoIsInStandardLibrary(const BuiltinType *Ty) {
     case BuiltinType::Char32:
     case BuiltinType::Int128:
     case BuiltinType::UInt128:
+    case BuiltinType::Int256:
+    case BuiltinType::UInt256:
       return true;
 
 #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
@@ -4667,21 +4669,33 @@ llvm::Constant *ItaniumCXXABI::getAddrOfRTTIDescriptor(QualType Ty) {
 
 void ItaniumCXXABI::EmitFundamentalRTTIDescriptors(const CXXRecordDecl *RD) {
   // Types added here must also be added to TypeInfoIsInStandardLibrary.
-  QualType FundamentalTypes[] = {
-      getContext().VoidTy,             getContext().NullPtrTy,
-      getContext().BoolTy,             getContext().WCharTy,
-      getContext().CharTy,             getContext().UnsignedCharTy,
-      getContext().SignedCharTy,       getContext().ShortTy,
-      getContext().UnsignedShortTy,    getContext().IntTy,
-      getContext().UnsignedIntTy,      getContext().LongTy,
-      getContext().UnsignedLongTy,     getContext().LongLongTy,
-      getContext().UnsignedLongLongTy, getContext().Int128Ty,
-      getContext().UnsignedInt128Ty,   getContext().HalfTy,
-      getContext().FloatTy,            getContext().DoubleTy,
-      getContext().LongDoubleTy,       getContext().Float128Ty,
-      getContext().Char8Ty,            getContext().Char16Ty,
-      getContext().Char32Ty
-  };
+  QualType FundamentalTypes[] = {getContext().VoidTy,
+                                 getContext().NullPtrTy,
+                                 getContext().BoolTy,
+                                 getContext().WCharTy,
+                                 getContext().CharTy,
+                                 getContext().UnsignedCharTy,
+                                 getContext().SignedCharTy,
+                                 getContext().ShortTy,
+                                 getContext().UnsignedShortTy,
+                                 getContext().IntTy,
+                                 getContext().UnsignedIntTy,
+                                 getContext().LongTy,
+                                 getContext().UnsignedLongTy,
+                                 getContext().LongLongTy,
+                                 getContext().UnsignedLongLongTy,
+                                 getContext().Int128Ty,
+                                 getContext().UnsignedInt128Ty,
+                                 getContext().Int256Ty,
+                                 getContext().UnsignedInt256Ty,
+                                 getContext().HalfTy,
+                                 getContext().FloatTy,
+                                 getContext().DoubleTy,
+                                 getContext().LongDoubleTy,
+                                 getContext().Float128Ty,
+                                 getContext().Char8Ty,
+                                 getContext().Char16Ty,
+                                 getContext().Char32Ty};
   llvm::GlobalValue::DLLStorageClassTypes DLLStorageClass =
       RD->hasAttr<DLLExportAttr>() || CGM.shouldMapVisibilityToDLLExport(RD)
           ? llvm::GlobalValue::DLLExportStorageClass
diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp
index 209654303a82b..718295cdd116d 100644
--- a/clang/lib/CodeGen/SwiftCallingConv.cpp
+++ b/clang/lib/CodeGen/SwiftCallingConv.cpp
@@ -679,6 +679,9 @@ bool swiftcall::isLegalIntegerType(CodeGenModule &CGM,
   case 128:
     return CGM.getContext().getTargetInfo().hasInt128Type();
 
+  case 256:
+    return CGM.getContext().getTargetInfo().hasInt256Type();
+
   default:
     return false;
   }
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index e6203db8bc245..997a03f77baf1 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -1821,6 +1821,10 @@ void X86_64ABIInfo::classify(QualType Ty, uint64_t OffsetBase, Class &Lo,
     } else if (k == BuiltinType::Int128 || k == BuiltinType::UInt128) {
       Lo = Integer;
       Hi = Integer;
+    } else if (k == BuiltinType::Int256 || k == BuiltinType::UInt256) {
+      // Exceeds 2 eightbytes; cannot be classified in registers per SysV ABI.
+      Lo = Memory;
+      Hi = Memory;
     } else if (k >= BuiltinType::Bool && k <= BuiltinType::LongLong) {
       Current = Integer;
     } else if (k == BuiltinType::Float || k == BuiltinType::Double ||
@@ -1926,7 +1930,9 @@ void X86_64ABIInfo::classify(QualType Ty, uint64_t OffsetBase, Class &Lo,
       // gcc passes 256 and 512 bit <X x __int128> vectors in memory. :(
       if (passInt128VectorsInMem() && Size != 128 &&
           (ElementType->isSpecificBuiltinType(BuiltinType::Int128) ||
-           ElementType->isSpecificBuiltinType(BuiltinType::UInt128)))
+           ElementType->isSpecificBuiltinType(BuiltinType::UInt128) ||
+           ElementType->isSpecificBuiltinType(BuiltinType::Int256) ||
+           ElementType->isSpecificBuiltinType(BuiltinType::UInt256)))
         return;
 
       // Arguments of 256-bits are split into four eightbyte chunks. The
@@ -2186,7 +2192,10 @@ ABIArgInfo X86_64ABIInfo::getIndirectReturnResult(QualType Ty) const {
     if (const auto *ED = Ty->getAsEnumDecl())
       Ty = ED->getIntegerType();
 
-    if (Ty->isBitIntType())
+    // Types that exceed two eightbytes (128 bits) cannot be returned in
+    // registers per the SysV ABI.  Route them through the indirect path
+    // just like _BitInt.
+    if (Ty->isBitIntType() || getContext().getTypeSize(Ty) > 128)
       return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
 
     return (isPromotableIntegerTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty)
@@ -2205,7 +2214,9 @@ bool X86_64ABIInfo::IsIllegalVectorType(QualType Ty) const {
     QualType EltTy = VecTy->getElementType();
     if (passInt128VectorsInMem() &&
         (EltTy->isSpecificBuiltinType(BuiltinType::Int128) ||
-         EltTy->isSpecificBuiltinType(BuiltinType::UInt128)))
+         EltTy->isSpecificBuiltinType(BuiltinType::UInt128) ||
+         EltTy->isSpecificBuiltinType(BuiltinType::Int256) ||
+         EltTy->isSpecificBuiltinType(BuiltinType::UInt256)))
       return true;
   }
 
@@ -2222,8 +2233,11 @@ ABIArgInfo X86_64ABIInfo::getIndirectResult(QualType Ty,
   // the argument in the free register. This does not seem to happen currently,
   // but this code would be much safer if we could mark the argument with
   // 'onstack'. See PR12193.
+  // Scalar types that fit in two eightbytes (128 bits) can be passed in
+  // registers naturally.  Larger scalar types (e.g. __int256) exceed the
+  // SysV ABI register-passing limit and must go through the indirect path.
   if (!isAggregateTypeForABI(Ty) && !IsIllegalVectorType(Ty) &&
-      !Ty->isBitIntType()) {
+      !Ty->isBitIntType() && getContext().getTypeSize(Ty) <= 128) {
     // Treat an enum type as its underlying type.
     if (const auto *ED = Ty->getAsEnumDecl())
       Ty = ED->getIntegerType();
@@ -3420,6 +3434,13 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
       return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
           llvm::Type::getInt64Ty(getVMContext()), 2));
 
+    case BuiltinType::Int256:
+    case BuiltinType::UInt256:
+      // > 8 bytes non-float/vector: passed indirectly on Win64.
+      return ABIArgInfo::getIndirect(
+          Align, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
+          /*ByVal=*/false);
+
     default:
       break;
     }
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 1ccd74314f373..217f78886835f 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1141,6 +1141,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
                    TI.getTypeWidth(TI.getWIntType()), TI, Builder);
   if (TI.hasInt128Type())
     DefineTypeSizeof("__SIZEOF_INT128__", 128, TI, Builder);
+  if (TI.hasInt256Type())
+    DefineTypeSizeof("__SIZEOF_INT256__", 256, TI, Builder);
 
   DefineType("__INTMAX_TYPE__", TI.getIntMaxType(), Builder);
   DefineFmt(LangOpts, "__INTMAX", TI.getIntMaxType(), TI, Builder);
diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp
index e3649631ac8d3..e810d68830e94 100644
--- a/clang/lib/Index/USRGeneration.cpp
+++ b/clang/lib/Index/USRGeneration.cpp
@@ -714,6 +714,9 @@ void USRGenerator::VisitType(QualType T) {
           Out << 'k'; break;
         case BuiltinType::UInt128:
           Out << 'j'; break;
+        case BuiltinType::UInt256:
+          Out << "@BT@UInt256";
+          break;
         case BuiltinType::Char_U:
         case BuiltinType::Char_S:
           Out << 'C'; break;
@@ -732,6 +735,9 @@ void USRGenerator::VisitType(QualType T) {
           Out << 'K'; break;
         case BuiltinType::Int128:
           Out << 'J'; break;
+        case BuiltinType::Int256:
+          Out << "@BT@Int256";
+          break;
         case BuiltinType::Float16:
         case BuiltinType::Half:
           Out << 'h'; break;
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index d0d006a78274e..f90ca97b54457 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -4315,6 +4315,10 @@ void Parser::ParseDeclarationSpecifiers(
       isInvalid = DS.SetTypeSpecType(DeclSpec::TST_int128, Loc, PrevSpec,
                                      DiagID, Policy);
       break;
+    case tok::kw___int256:
+      isInvalid = DS.SetTypeSpecType(DeclSpec::TST_int256, Loc, PrevSpec,
+                                     DiagID, Policy);
+      break;
     case tok::kw_half:
       isInvalid = DS.SetTypeSpecType(DeclSpec::TST_half, Loc, PrevSpec,
                                      DiagID, Policy);
@@ -5534,6 +5538,7 @@ bool Parser::isKnownToBeTypeSpecifier(const Token &Tok) const {
   case tok::kw_long:
   case tok::kw___int64:
   case tok::kw___int128:
+  case tok::kw___int256:
   case tok::kw_signed:
   case tok::kw_unsigned:
   case tok::kw__Complex:
@@ -5618,6 +5623,7 @@ bool Parser::isTypeSpecifierQualifier() {
   case tok::kw_long:
   case tok::kw___int64:
   case tok::kw___int128:
+  case tok::kw___int256:
   case tok::kw_signed:
   case tok::kw_unsigned:
   case tok::kw__Complex:
@@ -5836,6 +5842,7 @@ bool Parser::isDeclarationSpecifier(
   case tok::kw_long:
   case tok::kw___int64:
   case tok::kw___int128:
+  case tok::kw___int256:
   case tok::kw_signed:
   case tok::kw_unsigned:
   case tok::kw__Complex:
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index be6c7824cdbae..17b97d5347794 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -1331,6 +1331,7 @@ Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand,
   case tok::kw_long:
   case tok::kw___int64:
   case tok::kw___int128:
+  case tok::kw___int256:
   case tok::kw__ExtInt:
   case tok::kw__BitInt:
   case tok::kw_signed:
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index b3d50daf66b10..fc0594e2b0638 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -2134,6 +2134,9 @@ void Parser::ParseCXXSimpleTypeSpecifier(DeclSpec &DS) {
   case tok::kw___int128:
     DS.SetTypeSpecType(DeclSpec::TST_int128, Loc, PrevSpec, DiagID, Policy);
     break;
+  case tok::kw___int256:
+    DS.SetTypeSpecType(DeclSpec::TST_int256, Loc, PrevSpec, DiagID, Policy);
+    break;
   case tok::kw___bf16:
     DS.SetTypeSpecType(DeclSpec::TST_BFloat16, Loc, PrevSpec, DiagID, Policy);
     break;
diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp
index 3af20ce66a5d1..41eb17706c614 100644
--- a/clang/lib/Parse/ParseTentative.cpp
+++ b/clang/lib/Parse/ParseTentative.cpp
@@ -1485,6 +1485,7 @@ Parser::isCXXDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename,
   case tok::kw_long:
   case tok::kw___int64:
   case tok::kw___int128:
+  case tok::kw___int256:
   case tok::kw_signed:
   case tok::kw_unsigned:
   case tok::kw_half:
@@ -1613,6 +1614,7 @@ bool Parser::isCXXDeclarationSpecifierAType() {
   case tok::kw_long:
   case tok::kw___int64:
   case tok::kw___int128:
+  case tok::kw___int256:
   case tok::kw_signed:
   case tok::kw_unsigned:
   case tok::kw_half:
diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp
index 479a959e0aadc..22badb8739ca6 100644
--- a/clang/lib/Sema/DeclSpec.cpp
+++ b/clang/lib/Sema/DeclSpec.cpp
@@ -336,6 +336,7 @@ bool Declarator::isDeclarationOfFunction() const {
     case TST_half:
     case TST_int:
     case TST_int128:
+    case TST_int256:
     case TST_bitint:
     case TST_struct:
     case TST_interface:
@@ -541,6 +542,8 @@ const char *DeclSpec::getSpecifierName(DeclSpec::TST T,
   case DeclSpec::TST_char32:      return "char32_t";
   case DeclSpec::TST_int:         return "int";
   case DeclSpec::TST_int128:      return "__int128";
+  case DeclSpec::TST_int256:
+    return "__int256";
   case DeclSpec::TST_bitint:      return "_BitInt";
   case DeclSpec::TST_half:        return "half";
   case DeclSpec::TST_float:       return "float";
@@ -1300,8 +1303,9 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) {
     if (TypeSpecType == TST_unspecified)
       TypeSpecType = TST_int; // unsigned -> unsigned int, signed -> signed int.
     else if (TypeSpecType != TST_int && TypeSpecType != TST_int128 &&
-             TypeSpecType != TST_char && TypeSpecType != TST_wchar &&
-             !IsFixedPointType && TypeSpecType != TST_bitint) {
+             TypeSpecType != TST_int256 && TypeSpecType != TST_char &&
+             TypeSpecType != TST_wchar && !IsFixedPointType &&
+             TypeSpecType != TST_bitint) {
       S.Diag(TSSLoc, diag::err_invalid_sign_spec)
         << getSpecifierName((TST)TypeSpecType, Policy);
       // signed double -> double.
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 3065b5e1e66d3..7f69d2f975360 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -406,6 +406,18 @@ void Sema::Initialize() {
       PushOnScopeChains(Context.getUInt128Decl(), TUScope);
   }
 
+  // Initialize predefined 256-bit integer types, if needed.
+  if (Context.getTargetInfo().hasInt256Type() ||
+      (Context.getAuxTargetInfo() &&
+       Context.getAuxTargetInfo()->hasInt256Type())) {
+    DeclarationName Int256 = &Context.Idents.get("__int256_t");
+    if (IdResolver.begin(Int256) == IdResolver.end())
+      PushOnScopeChains(Context.getInt256Decl(), TUScope);
+
+    DeclarationName UInt256 = &Context.Idents.get("__uint256_t");
+    if (IdResolver.begin(UInt256) == IdResolver.end())
+      PushOnScopeChains(Context.getUInt256Decl(), TUScope);
+  }
 
   // Initialize predefined Objective-C types:
   if (getLangOpts().ObjC) {
@@ -2206,6 +2218,8 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
         (Ty->isIbm128Type() && !Context.getTargetInfo().hasIbm128Type()) ||
         (Ty->isIntegerType() && Context.getTypeSize(Ty) == 128 &&
          !Context.getTargetInfo().hasInt128Type()) ||
+        (Ty->isIntegerType() && Context.getTypeSize(Ty) == 256 &&
+         !Context.getTargetInfo().hasInt256Type()) ||
         (Ty->isBFloat16Type() && !Context.getTargetInfo().hasBFloat16Type() &&
          !LangOpts.CUDAIsDevice) ||
         LongDoubleMismatched) {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index a49e3883a35a5..3498882d4240f 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -419,11 +419,11 @@ static bool BuiltinOverflow(Sema &S, CallExpr *TheCall, unsigned BuiltinID) {
     // bool, a bit-precise type, or an enumeration type.
     if (const auto *BT = QT.getCanonicalType()->getAs<BuiltinType>())
       return (BT->getKind() >= BuiltinType::Short &&
-           BT->getKind() <= BuiltinType::Int128) || (
-           BT->getKind() >= BuiltinType::UShort &&
-           BT->getKind() <= BuiltinType::UInt128) ||
-           BT->getKind() == BuiltinType::UChar ||
-           BT->getKind() == BuiltinType::SChar;
+              BT->getKind() <= BuiltinType::Int256) ||
+             (BT->getKind() >= BuiltinType::UShort &&
+              BT->getKind() <= BuiltinType::UInt256) ||
+             BT->getKind() == BuiltinType::UChar ||
+             BT->getKind() == BuiltinType::SChar;
     return false;
   };
 
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index e5c4c59e9ffbb..8eeee8e004a19 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -9480,6 +9480,10 @@ class BuiltinOperatorOverloadBuilder {
         (S.Context.getAuxTargetInfo() &&
          S.Context.getAuxTargetInfo()->hasInt128Type()))
       ArithmeticTypes.push_back(S.Context.Int128Ty);
+    if (S.Context.getTargetInfo().hasInt256Type() ||
+        (S.Context.getAuxTargetInfo() &&
+         S.Context.getAuxTargetInfo()->hasInt256Type()))
+      ArithmeticTypes.push_back(S.Context.Int256Ty);
     ArithmeticTypes.push_back(S.Context.UnsignedIntTy);
     ArithmeticTypes.push_back(S.Context.UnsignedLongTy);
     ArithmeticTypes.push_back(S.Context.UnsignedLongLongTy);
@@ -9487,6 +9491,10 @@ class BuiltinOperatorOverloadBuilder {
         (S.Context.getAuxTargetInfo() &&
          S.Context.getAuxTargetInfo()->hasInt128Type()))
       ArithmeticTypes.push_back(S.Context.UnsignedInt128Ty);
+    if (S.Context.getTargetInfo().hasInt256Type() ||
+        (S.Context.getAuxTargetInfo() &&
+         S.Context.getAuxTargetInfo()->hasInt256Type()))
+      ArithmeticTypes.push_back(S.Context.UnsignedInt256Ty);
 
     /// We add candidates for the unique, unqualified _BitInt types present in
     /// the candidate type set. The candidate set already handled ensuring the
diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp
index 5b1aad3fa8470..1fc60f08f6a21 100644
--- a/clang/lib/Sema/SemaTemplateVariadic.cpp
+++ b/clang/lib/Sema/SemaTemplateVariadic.cpp
@@ -1167,6 +1167,7 @@ bool Sema::containsUnexpandedParameterPacks(Declarator &D) {
   case TST_char32:
   case TST_int:
   case TST_int128:
+  case TST_int256:
   case TST_half:
   case TST_float:
   case TST_double:
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index c082dd85f345f..036f9151497f5 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -1148,6 +1148,15 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) {
     else
       Result = Context.Int128Ty;
     break;
+  case DeclSpec::TST_int256:
+    if (!S.Context.getTargetInfo().hasInt256Type() &&
+        !(S.getLangOpts().isTargetDevice()))
+      S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported) << "__int256";
+    if (DS.getTypeSpecSign() == TypeSpecifierSign::Unsigned)
+      Result = Context.UnsignedInt256Ty;
+    else
+      Result = Context.Int256Ty;
+    break;
   case DeclSpec::TST_float16:
     // CUDA host and device may have different _Float16 support, therefore
     // do not diagnose _Float16 usage to avoid false alarm.
@@ -10201,18 +10210,21 @@ static QualType ChangeIntegralSignedness(Sema &S, QualType BaseType,
   }
 
   bool Int128Unsupported = !S.Context.getTargetInfo().hasInt128Type();
-  std::array<CanQualType *, 6> AllSignedIntegers = {
+  bool Int256Unsupported = !S.Context.getTargetInfo().hasInt256Type();
+  unsigned IntSkip = Int128Unsupported ? 2 : Int256Unsupported ? 1 : 0;
+  std::array<CanQualType *, 7> AllSignedIntegers = {
       &S.Context.SignedCharTy, &S.Context.ShortTy,    &S.Context.IntTy,
-      &S.Context.LongTy,       &S.Context.LongLongTy, &S.Context.Int128Ty};
+      &S.Context.LongTy,       &S.Context.LongLongTy, &S.Context.Int128Ty,
+      &S.Context.Int256Ty};
   ArrayRef<CanQualType *> AvailableSignedIntegers(
-      AllSignedIntegers.data(), AllSignedIntegers.size() - Int128Unsupported);
-  std::array<CanQualType *, 6> AllUnsignedIntegers = {
+      AllSignedIntegers.data(), AllSignedIntegers.size() - IntSkip);
+  std::array<CanQualType *, 7> AllUnsignedIntegers = {
       &S.Context.UnsignedCharTy,     &S.Context.UnsignedShortTy,
       &S.Context.UnsignedIntTy,      &S.Context.UnsignedLongTy,
-      &S.Context.UnsignedLongLongTy, &S.Context.UnsignedInt128Ty};
-  ArrayRef<CanQualType *> AvailableUnsignedIntegers(AllUnsignedIntegers.data(),
-                                                    AllUnsignedIntegers.size() -
-                                                        Int128Unsupported);
+      &S.Context.UnsignedLongLongTy, &S.Context.UnsignedInt128Ty,
+      &S.Context.UnsignedInt256Ty};
+  ArrayRef<CanQualType *> AvailableUnsignedIntegers(
+      AllUnsignedIntegers.data(), AllUnsignedIntegers.size() - IntSkip);
   ArrayRef<CanQualType *> *Consider =
       IsMakeSigned ? &AvailableSignedIntegers : &AvailableUnsignedIntegers;
 
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index a416c73c458b2..dbab14eca9b92 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -17510,10 +17510,10 @@ QualType TreeTransform<Derived>::RebuildArrayType(
                                   getDerived().getBaseEntity());
 
   QualType Types[] = {
-    SemaRef.Context.UnsignedCharTy, SemaRef.Context.UnsignedShortTy,
-    SemaRef.Context.UnsignedIntTy, SemaRef.Context.UnsignedLongTy,
-    SemaRef.Context.UnsignedLongLongTy, SemaRef.Context.UnsignedInt128Ty
-  };
+      SemaRef.Context.UnsignedCharTy,     SemaRef.Context.UnsignedShortTy,
+      SemaRef.Context.UnsignedIntTy,      SemaRef.Context.UnsignedLongTy,
+      SemaRef.Context.UnsignedLongLongTy, SemaRef.Context.UnsignedInt128Ty,
+      SemaRef.Context.UnsignedInt256Ty};
   QualType SizeType;
   for (const auto &T : Types)
     if (Size->getBitWidth() == SemaRef.Context.getIntWidth(T)) {
diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp
index 69db02f2efc40..b91a8547fa2f8 100644
--- a/clang/lib/Serialization/ASTCommon.cpp
+++ b/clang/lib/Serialization/ASTCommon.cpp
@@ -53,6 +53,9 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) {
   case BuiltinType::UInt128:
     ID = PREDEF_TYPE_UINT128_ID;
     break;
+  case BuiltinType::UInt256:
+    ID = PREDEF_TYPE_UINT256_ID;
+    break;
   case BuiltinType::Char_S:
     ID = PREDEF_TYPE_CHAR_S_ID;
     break;
@@ -78,6 +81,9 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) {
   case BuiltinType::Int128:
     ID = PREDEF_TYPE_INT128_ID;
     break;
+  case BuiltinType::Int256:
+    ID = PREDEF_TYPE_INT256_ID;
+    break;
   case BuiltinType::Half:
     ID = PREDEF_TYPE_HALF_ID;
     break;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index bde000234a062..c4c173fb34d3a 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -7771,6 +7771,9 @@ QualType ASTReader::GetType(TypeID ID) {
     case PREDEF_TYPE_UINT128_ID:
       T = Context.UnsignedInt128Ty;
       break;
+    case PREDEF_TYPE_UINT256_ID:
+      T = Context.UnsignedInt256Ty;
+      break;
     case PREDEF_TYPE_SCHAR_ID:
       T = Context.SignedCharTy;
       break;
@@ -7792,6 +7795,9 @@ QualType ASTReader::GetType(TypeID ID) {
     case PREDEF_TYPE_INT128_ID:
       T = Context.Int128Ty;
       break;
+    case PREDEF_TYPE_INT256_ID:
+      T = Context.Int256Ty;
+      break;
     case PREDEF_TYPE_BFLOAT16_ID:
       T = Context.BFloat16Ty;
       break;
@@ -8360,6 +8366,18 @@ Decl *ASTReader::getPredefinedDecl(PredefinedDeclIDs ID) {
     NewLoaded = Context.getUInt128Decl();
     break;
 
+  case PREDEF_DECL_INT_256_ID:
+    if (Context.Int256Decl)
+      return Context.Int256Decl;
+    NewLoaded = Context.getInt256Decl();
+    break;
+
+  case PREDEF_DECL_UNSIGNED_INT_256_ID:
+    if (Context.UInt256Decl)
+      return Context.UInt256Decl;
+    NewLoaded = Context.getUInt256Decl();
+    break;
+
   case PREDEF_DECL_OBJC_INSTANCETYPE_ID:
     if (Context.ObjCInstanceTypeDecl)
       return Context.ObjCInstanceTypeDecl;
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index af46f84d5aac0..dfba5b62ebdd9 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5641,6 +5641,8 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &SemaRef) {
                      PREDEF_DECL_OBJC_PROTOCOL_ID);
   RegisterPredefDecl(Context.Int128Decl, PREDEF_DECL_INT_128_ID);
   RegisterPredefDecl(Context.UInt128Decl, PREDEF_DECL_UNSIGNED_INT_128_ID);
+  RegisterPredefDecl(Context.Int256Decl, PREDEF_DECL_INT_256_ID);
+  RegisterPredefDecl(Context.UInt256Decl, PREDEF_DECL_UNSIGNED_INT_256_ID);
   RegisterPredefDecl(Context.ObjCInstanceTypeDecl,
                      PREDEF_DECL_OBJC_INSTANCETYPE_ID);
   RegisterPredefDecl(Context.BuiltinVaListDecl, PREDEF_DECL_BUILTIN_VA_LIST_ID);

>From 7503a37c8b58f2527926c01c56816d49ebe67884 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche@algolia.com>
Date: Tue, 24 Feb 2026 21:38:36 +0100
Subject: [PATCH 03/17] [clang][test] Add __int256/__uint256 clang tests

Comprehensive test coverage for the __int256 builtin type:
- AST: constant interpreter, JSON dump updates for new predefined type ID
- CodeGen: AArch64/X86 argument passing, data layout, debug info, float
  conversions, overflow builtins, varargs, mangling (Itanium/MSVC)
- Sema: type acceptance on 64-bit targets, atomic/bitfield constraints,
  constant evaluation, struct layout, tautological comparisons,
  templates, type traits, overload resolution
- CUDA: device-side acceptance
- SYCL: spir64 rejection
- Preprocessor: __INT256_MAX__ and related macros

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 clang/test/AST/ByteCode/int256.cpp            | 201 +++++++++++++++
 clang/test/AST/ast-dump-default-arg-json.cpp  |  46 ++++
 clang/test/AST/ast-dump-default-init-json.cpp |  46 ++++
 clang/test/AST/ast-dump-file-line-json.c      |  46 ++++
 clang/test/AST/ast-dump-lambda-json.cpp       |  46 ++++
 .../test/AST/ast-dump-template-decls-json.cpp |  46 ++++
 ...dump-template-json-win32-mangler-crash.cpp |  46 ++++
 clang/test/AST/ast-dump-templates.cpp         |  46 ++++
 clang/test/CXX/drs/cwg4xx.cpp                 |   2 +
 .../AArch64/aarch64-arguments-int256.c        |  39 +++
 .../CodeGen/AArch64/aarch64-int256-args.c     |  30 +++
 clang/test/CodeGen/X86/win64-int256.c         |  22 ++
 clang/test/CodeGen/X86/x86_64-PR42672.c       |   4 +-
 .../CodeGen/X86/x86_64-arguments-int256.c     |  39 +++
 clang/test/CodeGen/X86/x86_64-atomic-i256.c   |  32 +++
 clang/test/CodeGen/debug-info-int256.c        |  12 +
 clang/test/CodeGen/float-conv-int256.c        |  63 +++++
 clang/test/CodeGen/int256-func-ptr.c          |  34 +++
 clang/test/CodeGen/int256-globals.c           |  39 +++
 clang/test/CodeGen/overflow-builtins-int256.c |  59 +++++
 clang/test/CodeGen/uint256_t.c                | 239 ++++++++++++++++++
 clang/test/CodeGen/varargs-int256.c           |  67 +++++
 clang/test/CodeGenCXX/mangle-int256.cpp       |  32 +++
 clang/test/Modules/decl-params-determinisim.m |  16 +-
 clang/test/Preprocessor/init-aarch64.c        |   1 +
 clang/test/Preprocessor/init.c                |   1 +
 clang/test/Sema/256bitint.c                   |  72 ++++++
 clang/test/Sema/atomic-builtins-int256.c      |  29 +++
 clang/test/Sema/atomic-int256.c               |  26 ++
 clang/test/Sema/bitfield-int256.c             |  42 +++
 clang/test/Sema/const-eval.c                  |   5 +
 clang/test/Sema/constant-builtins-2.c         |  15 ++
 clang/test/Sema/enum.c                        |   4 +-
 clang/test/Sema/struct-layout-int256.c        |  70 +++++
 .../test/Sema/tautological-constant-compare.c |   5 +
 clang/test/Sema/types.c                       |  25 ++
 clang/test/SemaCUDA/int256.cu                 |  30 +++
 clang/test/SemaCXX/deleted-operator.cpp       |   4 +-
 clang/test/SemaCXX/int256-templates.cpp       | 219 ++++++++++++++++
 clang/test/SemaCXX/int256-type-traits.cpp     |  74 ++++++
 .../SemaCXX/overloaded-builtin-operators.cpp  |   4 +-
 clang/test/SemaSYCL/int256.cpp                |  74 ++++++
 42 files changed, 1938 insertions(+), 14 deletions(-)
 create mode 100644 clang/test/AST/ByteCode/int256.cpp
 create mode 100644 clang/test/CodeGen/AArch64/aarch64-arguments-int256.c
 create mode 100644 clang/test/CodeGen/AArch64/aarch64-int256-args.c
 create mode 100644 clang/test/CodeGen/X86/win64-int256.c
 create mode 100644 clang/test/CodeGen/X86/x86_64-arguments-int256.c
 create mode 100644 clang/test/CodeGen/X86/x86_64-atomic-i256.c
 create mode 100644 clang/test/CodeGen/debug-info-int256.c
 create mode 100644 clang/test/CodeGen/float-conv-int256.c
 create mode 100644 clang/test/CodeGen/int256-func-ptr.c
 create mode 100644 clang/test/CodeGen/int256-globals.c
 create mode 100644 clang/test/CodeGen/overflow-builtins-int256.c
 create mode 100644 clang/test/CodeGen/uint256_t.c
 create mode 100644 clang/test/CodeGen/varargs-int256.c
 create mode 100644 clang/test/CodeGenCXX/mangle-int256.cpp
 create mode 100644 clang/test/Sema/256bitint.c
 create mode 100644 clang/test/Sema/atomic-builtins-int256.c
 create mode 100644 clang/test/Sema/atomic-int256.c
 create mode 100644 clang/test/Sema/bitfield-int256.c
 create mode 100644 clang/test/Sema/struct-layout-int256.c
 create mode 100644 clang/test/SemaCUDA/int256.cu
 create mode 100644 clang/test/SemaCXX/int256-templates.cpp
 create mode 100644 clang/test/SemaCXX/int256-type-traits.cpp
 create mode 100644 clang/test/SemaSYCL/int256.cpp

diff --git a/clang/test/AST/ByteCode/int256.cpp b/clang/test/AST/ByteCode/int256.cpp
new file mode 100644
index 0000000000000..7ec7901a04d3f
--- /dev/null
+++ b/clang/test/AST/ByteCode/int256.cpp
@@ -0,0 +1,201 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple x86_64-unknown-linux-gnu -std=c++20 -verify=expected,both %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -verify=ref,both %s
+
+// Constexpr evaluation tests for __int256_t / __uint256_t.
+
+namespace Arithmetic {
+  constexpr __int256_t a = 100;
+  constexpr __int256_t b = 7;
+  static_assert(a + b == 107, "");
+  static_assert(a - b == 93, "");
+  static_assert(a * b == 700, "");
+  static_assert(a / b == 14, "");
+  static_assert(a % b == 2, "");
+
+  constexpr __int256_t product = 12345 * 67890;
+  static_assert(product == 838102050, "");
+}
+
+namespace Bitwise {
+  constexpr __uint256_t x = 0xFF00FF;
+  constexpr __uint256_t y = 0x0F0F0F;
+  static_assert((x & y) == 0x0F000F, "");
+  static_assert((x | y) == 0xFF0FFF, "");
+  static_assert((x ^ y) == 0xF00FF0, "");
+  static_assert(~(__uint256_t)0 != 0, "");
+}
+
+namespace Shifts {
+  constexpr __int256_t one = 1;
+  static_assert((one << 0) == 1, "");
+  static_assert((one << 1) == 2, "");
+  static_assert((one << 64) != 0, "");
+  static_assert((one << 128) != 0, "");
+  static_assert((one << 255) != 0, "");
+  static_assert(((__uint256_t)one << 255) >> 255 == 1, "");
+
+  constexpr __uint256_t large = (__uint256_t)1 << 200;
+  static_assert(large != 0, "");
+  static_assert(large >> 200 == 1, "");
+}
+
+namespace Comparisons {
+  constexpr __int256_t a = 100;
+  constexpr __int256_t b = 7;
+  static_assert(a > b, "");
+  static_assert(b < a, "");
+  static_assert(a >= 100, "");
+  static_assert(b <= 7, "");
+  static_assert(a != b, "");
+  static_assert(a == 100, "");
+}
+
+namespace Conversions {
+  constexpr __int128_t i128 = 42;
+  constexpr __int256_t from128 = i128;
+  static_assert(from128 == 42, "");
+  constexpr __int128_t to128 = (__int128_t)from128;
+  static_assert(to128 == 42, "");
+
+  constexpr long long ll = 99;
+  constexpr __int256_t fromll = ll;
+  static_assert(fromll == 99, "");
+}
+
+namespace UnaryOps {
+  constexpr __int256_t a = 100;
+  constexpr __int256_t neg = -a;
+  static_assert(neg == -100, "");
+  static_assert(-neg == 100, "");
+}
+
+namespace Wrapping {
+  constexpr __uint256_t zero = 0;
+  constexpr __uint256_t wrap = zero - 1;
+  static_assert(wrap + 1 == 0, "");
+}
+
+namespace DivByZero {
+  constexpr __int256_t divzero = __int256_t{1} / __int256_t{0}; // both-error {{must be initialized by a constant expression}} \
+                                                                  // both-note {{division by zero}}
+  constexpr __int256_t remzero = __int256_t{1} % __int256_t{0}; // both-error {{must be initialized by a constant expression}} \
+                                                                  // both-note {{division by zero}}
+}
+
+namespace BoundaryConstants {
+  // UINT256_MAX = 2^256 - 1 = ((__uint256_t)1 << 255) | (((__uint256_t)1 << 255) - 1)
+  constexpr __uint256_t UINT256_MAX = ~(__uint256_t)0;
+  static_assert(UINT256_MAX != 0, "");
+  static_assert(UINT256_MAX + 1 == 0, ""); // wraps to zero
+  static_assert((UINT256_MAX >> 255) == 1, "");
+
+  // INT256_MAX = 2^255 - 1 (sign bit clear, all other bits set)
+  constexpr __int256_t INT256_MAX = (__int256_t)(UINT256_MAX >> 1);
+  static_assert(INT256_MAX > 0, "");
+  constexpr __uint256_t check_max = (__uint256_t)INT256_MAX;
+  static_assert((check_max >> 254) == 1, ""); // bit 254 set
+
+  // INT256_MIN = -2^255 (sign bit set, all other bits clear)
+  constexpr __int256_t INT256_MIN = -INT256_MAX - 1;
+  static_assert(INT256_MIN < 0, "");
+  static_assert(INT256_MIN + INT256_MAX == -1, "");
+
+  // Full-width values using all 256 bits
+  constexpr __uint256_t all_ones = ~(__uint256_t)0;
+  constexpr __uint256_t alternating = all_ones / 3; // 0x5555...
+  static_assert(alternating != 0, "");
+  static_assert((alternating & (alternating << 1)) == 0, ""); // no adjacent bits
+}
+
+namespace OverflowDetection {
+  // Signed overflow in constexpr is undefined behavior -- not a constant expression
+  constexpr __int256_t INT256_MAX = (__int256_t)(~(__uint256_t)0 >> 1);
+  constexpr __int256_t overflow_add = INT256_MAX + 1; // both-error {{must be initialized by a constant expression}} \
+                                                       // both-note {{value 57896044618658097711785492504343953926634992332820282019728792003956564819968 is outside the range of representable values}}
+}
+
+namespace MoreConversions {
+  // Bool conversions
+  constexpr bool from_zero = (__int256_t)0;
+  static_assert(!from_zero, "");
+  constexpr bool from_one = (__int256_t)1;
+  static_assert(from_one, "");
+  constexpr bool from_neg = (__int256_t)-1;
+  static_assert(from_neg, "");
+
+  // Char conversions
+  constexpr char c = 'A';
+  constexpr __int256_t from_char = c;
+  static_assert(from_char == 65, "");
+  constexpr char to_char = (char)from_char;
+  static_assert(to_char == 'A', "");
+
+  // Int conversions
+  constexpr int i = 42;
+  constexpr __int256_t from_int = i;
+  static_assert(from_int == 42, "");
+  constexpr int to_int = (int)from_int;
+  static_assert(to_int == 42, "");
+
+  // Long conversions
+  constexpr long l = 1000000L;
+  constexpr __int256_t from_long = l;
+  static_assert(from_long == 1000000, "");
+
+  // __int256 <-> __int128 round-trip with negative
+  constexpr __int128_t neg128 = -42;
+  constexpr __int256_t from_neg128 = neg128;
+  static_assert(from_neg128 == -42, "");
+  constexpr __int128_t to_neg128 = (__int128_t)from_neg128;
+  static_assert(to_neg128 == -42, "");
+}
+
+namespace CompoundAssignment {
+  constexpr __int256_t test_compound() {
+    __int256_t x = 100;
+    x += 50;   // 150
+    x -= 30;   // 120
+    x *= 2;    // 240
+    x /= 3;    // 80
+    x %= 7;    // 3
+    x <<= 4;   // 48
+    x >>= 2;   // 12
+    x &= 0xFF; // 12
+    x |= 0x100;// 268
+    x ^= 0xF;  // 259
+    return x;
+  }
+  static_assert(test_compound() == 259, "");
+}
+
+namespace IncrementDecrement {
+  constexpr __int256_t test_inc_dec() {
+    __int256_t x = 0;
+    ++x;      // 1
+    x++;      // 2
+    --x;      // 1
+    x--;      // 0
+    return x;
+  }
+  static_assert(test_inc_dec() == 0, "");
+
+  // Unsigned wrapping with decrement
+  constexpr __uint256_t test_wrap_dec() {
+    __uint256_t x = 0;
+    --x; // wraps to UINT256_MAX
+    ++x; // wraps back to 0
+    return x;
+  }
+  static_assert(test_wrap_dec() == 0, "");
+}
+
+namespace ConstexprFunc {
+  constexpr __int256_t factorial(__int256_t n) {
+    __int256_t result = 1;
+    for (__int256_t i = 2; i <= n; ++i)
+      result *= i;
+    return result;
+  }
+  static_assert(factorial(10) == 3628800, "");
+  static_assert(factorial(20) == 2432902008176640000LL, "");
+}
diff --git a/clang/test/AST/ast-dump-default-arg-json.cpp b/clang/test/AST/ast-dump-default-arg-json.cpp
index b6a138934caf9..b34b1cbafc924 100644
--- a/clang/test/AST/ast-dump-default-arg-json.cpp
+++ b/clang/test/AST/ast-dump-default-arg-json.cpp
@@ -80,6 +80,52 @@ void test() {
 // CHECK-NEXT:     "end": {}
 // CHECK-NEXT:    },
 // CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__int256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "__int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "__int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__uint256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "unsigned __int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "unsigned __int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
 // CHECK-NEXT:    "name": "__NSConstantString",
 // CHECK-NEXT:    "type": {
 // CHECK-NEXT:     "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp
index f4949a9c9eedf..50d1100ba11c6 100644
--- a/clang/test/AST/ast-dump-default-init-json.cpp
+++ b/clang/test/AST/ast-dump-default-init-json.cpp
@@ -78,6 +78,52 @@ void test() {
 // CHECK-NEXT:     "end": {}
 // CHECK-NEXT:    },
 // CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__int256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "__int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "__int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__uint256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "unsigned __int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "unsigned __int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
 // CHECK-NEXT:    "name": "__NSConstantString",
 // CHECK-NEXT:    "type": {
 // CHECK-NEXT:     "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-file-line-json.c b/clang/test/AST/ast-dump-file-line-json.c
index da1c8dbd755d5..1b5e8de80df79 100644
--- a/clang/test/AST/ast-dump-file-line-json.c
+++ b/clang/test/AST/ast-dump-file-line-json.c
@@ -76,6 +76,52 @@ int e;
 // CHECK-NEXT:     "end": {}
 // CHECK-NEXT:    },
 // CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__int256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "__int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "__int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__uint256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "unsigned __int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "unsigned __int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
 // CHECK-NEXT:    "name": "__NSConstantString",
 // CHECK-NEXT:    "type": {
 // CHECK-NEXT:     "qualType": "struct __NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-lambda-json.cpp b/clang/test/AST/ast-dump-lambda-json.cpp
index fc28cc8164e17..3616426aa6f8a 100644
--- a/clang/test/AST/ast-dump-lambda-json.cpp
+++ b/clang/test/AST/ast-dump-lambda-json.cpp
@@ -87,6 +87,52 @@ void Test() {
 // CHECK-NEXT:     "end": {}
 // CHECK-NEXT:    },
 // CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__int256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "__int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "__int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__uint256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "unsigned __int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "unsigned __int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
 // CHECK-NEXT:    "name": "__NSConstantString",
 // CHECK-NEXT:    "type": {
 // CHECK-NEXT:     "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-template-decls-json.cpp b/clang/test/AST/ast-dump-template-decls-json.cpp
index 70f1d3b55f3ee..0e4e6b3ab4544 100644
--- a/clang/test/AST/ast-dump-template-decls-json.cpp
+++ b/clang/test/AST/ast-dump-template-decls-json.cpp
@@ -128,6 +128,52 @@ W(int)->W<1>;
 // CHECK-NEXT:     "end": {}
 // CHECK-NEXT:    },
 // CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__int256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "__int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "__int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__uint256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "unsigned __int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "unsigned __int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
 // CHECK-NEXT:    "name": "__NSConstantString",
 // CHECK-NEXT:    "type": {
 // CHECK-NEXT:     "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
index 43eae10b27b3a..54e9040740786 100644
--- a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
+++ b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
@@ -137,6 +137,52 @@ int main()
 // CHECK-NEXT:     "end": {}
 // CHECK-NEXT:    },
 // CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__int256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "__int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "__int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
+// CHECK-NEXT:    "name": "__uint256_t",
+// CHECK-NEXT:    "type": {
+// CHECK-NEXT:     "qualType": "unsigned __int256"
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "inner": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:      "id": "0x{{.*}}",
+// CHECK-NEXT:      "kind": "BuiltinType",
+// CHECK-NEXT:      "type": {
+// CHECK-NEXT:       "qualType": "unsigned __int256"
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+// CHECK-NEXT:    ]
+// CHECK-NEXT:   },
+// CHECK-NEXT:   {
+// CHECK-NEXT:    "id": "0x{{.*}}",
+// CHECK-NEXT:    "kind": "TypedefDecl",
+// CHECK-NEXT:    "loc": {},
+// CHECK-NEXT:    "range": {
+// CHECK-NEXT:     "begin": {},
+// CHECK-NEXT:     "end": {}
+// CHECK-NEXT:    },
+// CHECK-NEXT:    "isImplicit": true,
 // CHECK-NEXT:    "name": "__NSConstantString",
 // CHECK-NEXT:    "type": {
 // CHECK-NEXT:     "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp
index 8cf9b6a29e332..377202e07fb71 100644
--- a/clang/test/AST/ast-dump-templates.cpp
+++ b/clang/test/AST/ast-dump-templates.cpp
@@ -343,6 +343,52 @@ namespace TestAbbreviatedTemplateDecls {
 // JSON-NEXT:     "end": {}
 // JSON-NEXT:    },
 // JSON-NEXT:    "isImplicit": true,
+// JSON-NEXT:    "name": "__int256_t",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "__int256"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "BuiltinType",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "__int256"
+// JSON-NEXT:      }
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "TypedefDecl",
+// JSON-NEXT:    "loc": {},
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {},
+// JSON-NEXT:     "end": {}
+// JSON-NEXT:    },
+// JSON-NEXT:    "isImplicit": true,
+// JSON-NEXT:    "name": "__uint256_t",
+// JSON-NEXT:    "type": {
+// JSON-NEXT:     "qualType": "unsigned __int256"
+// JSON-NEXT:    },
+// JSON-NEXT:    "inner": [
+// JSON-NEXT:     {
+// JSON-NEXT:      "id": "0x{{.*}}",
+// JSON-NEXT:      "kind": "BuiltinType",
+// JSON-NEXT:      "type": {
+// JSON-NEXT:       "qualType": "unsigned __int256"
+// JSON-NEXT:      }
+// JSON-NEXT:     }
+// JSON-NEXT:    ]
+// JSON-NEXT:   },
+// JSON-NEXT:   {
+// JSON-NEXT:    "id": "0x{{.*}}",
+// JSON-NEXT:    "kind": "TypedefDecl",
+// JSON-NEXT:    "loc": {},
+// JSON-NEXT:    "range": {
+// JSON-NEXT:     "begin": {},
+// JSON-NEXT:     "end": {}
+// JSON-NEXT:    },
+// JSON-NEXT:    "isImplicit": true,
 // JSON-NEXT:    "name": "__NSConstantString",
 // JSON-NEXT:    "type": {
 // JSON-NEXT:     "qualType": "__NSConstantString_tag"
diff --git a/clang/test/CXX/drs/cwg4xx.cpp b/clang/test/CXX/drs/cwg4xx.cpp
index 44385224aa388..920bbac855285 100644
--- a/clang/test/CXX/drs/cwg4xx.cpp
+++ b/clang/test/CXX/drs/cwg4xx.cpp
@@ -545,6 +545,8 @@ namespace cwg425 { // cwg425: 2.7
   //   expected-note-re at -11 {{built-in candidate operator*{{.*}}}}
   //   expected-note-re at -12 {{built-in candidate operator*{{.*}}}}
   //   expected-note-re at -13 {{built-in candidate operator*{{.*}}}}
+  //   expected-note-re at -14 {{built-in candidate operator*{{.*}}}}
+  //   expected-note-re at -15 {{built-in candidate operator*{{.*}}}}
 
   template<typename T> struct is_float;
   template<> struct is_float<float> { typedef void type; };
diff --git a/clang/test/CodeGen/AArch64/aarch64-arguments-int256.c b/clang/test/CodeGen/AArch64/aarch64-arguments-int256.c
new file mode 100644
index 0000000000000..e2a63645dc918
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/aarch64-arguments-int256.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify AArch64 IR generation for __int256_t arguments and returns.
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256(i256 noundef %a)
+__int256_t f_ret256(__int256_t a) { return a; }
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256u(i256 noundef %a)
+__uint256_t f_ret256u(__uint256_t a) { return a; }
+
+// Multiple 256-bit args
+// CHECK-LABEL: define{{.*}} i256 @f_two256(i256 noundef %a, i256 noundef %b)
+__int256_t f_two256(__int256_t a, __int256_t b) { return a + b; }
+
+// Mixed argument sizes: 256-bit with smaller types
+// CHECK-LABEL: define{{.*}} i256 @f_mixed(i64 noundef %x, i256 noundef %a, i32 noundef %y)
+__int256_t f_mixed(long long x, __int256_t a, int y) { return a; }
+
+// Register exhaustion: 3 i256 args still passed directly
+// CHECK-LABEL: define{{.*}} i256 @f_three256(i256 noundef %a, i256 noundef %b, i256 noundef %c)
+__int256_t f_three256(__int256_t a, __int256_t b, __int256_t c) { return a + b + c; }
+
+// Struct containing a 256-bit integer: passed/returned via sret/indirect
+struct s256 { __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct256(ptr dead_on_unwind noalias writable sret(%struct.s256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct s256 f_struct256(struct s256 s) { return s; }
+
+// Nested struct with __int256: also indirect
+struct nested256 { int x; __int256_t val; int y; };
+
+// CHECK-LABEL: define{{.*}} void @f_nested256(ptr dead_on_unwind noalias writable sret(%struct.nested256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct nested256 f_nested256(struct nested256 s) { return s; }
+
+// Packed struct with __int256
+struct __attribute__((packed)) packed256 { char c; __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_packed256(ptr dead_on_unwind noalias writable sret(%struct.packed256) align 1 %{{.*}}, ptr noundef dead_on_return %s)
+struct packed256 f_packed256(struct packed256 s) { return s; }
diff --git a/clang/test/CodeGen/AArch64/aarch64-int256-args.c b/clang/test/CodeGen/AArch64/aarch64-int256-args.c
new file mode 100644
index 0000000000000..9ee68e9583c5a
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/aarch64-int256-args.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify AArch64 handles many __int256 arguments (register exhaustion).
+// Each __int256 consumes 4 GPRs (x0-x3, x4-x7), so the 3rd+ arg must
+// spill to the stack when the backend lowers this.
+
+// CHECK-LABEL: define{{.*}} i256 @f_five(i256 noundef %a, i256 noundef %b, i256 noundef %c, i256 noundef %d, i256 noundef %e)
+// CHECK: add nsw i256
+__int256 f_five(__int256 a, __int256 b, __int256 c, __int256 d, __int256 e) {
+  return a + b + c + d + e;
+}
+
+// Mixed argument sizes: smaller args consume individual GPRs, then __int256
+// takes 4 GPRs each.
+// CHECK-LABEL: define{{.*}} i256 @f_mixed(i32 noundef %x, i256 noundef %a, i64 noundef %y, i256 noundef %b, i32 noundef %z)
+// CHECK: add nsw i256
+__int256 f_mixed(int x, __int256 a, long long y, __int256 b, int z) {
+  return a + b;
+}
+
+// Struct containing __int256: must go indirect per AAPCS (>16 bytes)
+struct s256 { __int256 val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct(ptr{{.*}}sret(%struct.s256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct s256 f_struct(struct s256 s) { return s; }
+
+// Verify direct scalar __int256 return (even though struct s256 is indirect)
+// CHECK-LABEL: define{{.*}} i256 @f_scalar_ret(i256 noundef %x)
+// CHECK: ret i256
+__int256 f_scalar_ret(__int256 x) { return x; }
diff --git a/clang/test/CodeGen/X86/win64-int256.c b/clang/test/CodeGen/X86/win64-int256.c
new file mode 100644
index 0000000000000..b8767c0211bc4
--- /dev/null
+++ b/clang/test/CodeGen/X86/win64-int256.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=GNU
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+
+// Verify __int256 ABI on Windows targets (both GNU and MSVC).
+// On Win64, __int256 is passed/returned indirectly (pointer args, sret return).
+
+// GNU-LABEL: define dso_local void @f_ret(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0)
+// MSVC-LABEL: define dso_local void @f_ret(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0)
+__int256 f_ret(__int256 a) { return a; }
+
+// GNU-LABEL: define dso_local void @f_two(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0, ptr noundef dead_on_return %1)
+// MSVC-LABEL: define dso_local void @f_two(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0, ptr noundef dead_on_return %1)
+__int256 f_two(__int256 a, __int256 b) { return a + b; }
+
+// GNU-LABEL: define dso_local i32 @f_narrow(ptr noundef dead_on_return %0)
+// MSVC-LABEL: define dso_local i32 @f_narrow(ptr noundef dead_on_return %0)
+int f_narrow(__int256 a) { return (int)a; }
+
+// Mixed: small args passed in registers, __int256 via pointer
+// GNU-LABEL: define dso_local void @f_mixed(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, i32 noundef %x, ptr noundef dead_on_return %0, i32 noundef %y)
+// MSVC-LABEL: define dso_local void @f_mixed(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, i32 noundef %x, ptr noundef dead_on_return %0, i32 noundef %y)
+__int256 f_mixed(int x, __int256 a, int y) { return a; }
diff --git a/clang/test/CodeGen/X86/x86_64-PR42672.c b/clang/test/CodeGen/X86/x86_64-PR42672.c
index 42894c0c4cb57..3285b084d285f 100644
--- a/clang/test/CodeGen/X86/x86_64-PR42672.c
+++ b/clang/test/CodeGen/X86/x86_64-PR42672.c
@@ -61,10 +61,12 @@ void odd_struct(void) {
 // CHECK-IMPOSSIBLE_ODD: impossible constraint in asm: cannot store value into a register
 
 // Check Clang reports an error if attempting to return a big structure via a register.
+// Use 5 x long long (40 bytes / 320 bits) since 4 x long long (32 bytes / 256 bits)
+// can be represented as __uint256_t on targets with __int256 support.
 void big_struct(void) {
 #ifdef IMPOSSIBLE_BIG
   struct {
-    long long int v1, v2, v3, v4;
+    long long int v1, v2, v3, v4, v5;
   } str;
   asm("nop"
       : "=r"(str));
diff --git a/clang/test/CodeGen/X86/x86_64-arguments-int256.c b/clang/test/CodeGen/X86/x86_64-arguments-int256.c
new file mode 100644
index 0000000000000..86def39e81e95
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86_64-arguments-int256.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify X86-64 IR generation for __int256_t arguments and returns.
+// Per the SysV ABI, types exceeding two eightbytes (128 bits) are passed
+// and returned in memory (sret/byval).
+
+// CHECK-LABEL: define{{.*}} void @f_ret256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0)
+__int256_t f_ret256(__int256_t a) { return a; }
+
+// CHECK-LABEL: define{{.*}} void @f_ret256u(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0)
+__uint256_t f_ret256u(__uint256_t a) { return a; }
+
+// Multiple 256-bit args
+// CHECK-LABEL: define{{.*}} void @f_two256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+__int256_t f_two256(__int256_t a, __int256_t b) { return a + b; }
+
+// Mixed argument sizes: 256-bit with smaller types
+// CHECK-LABEL: define{{.*}} void @f_mixed(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, i64 noundef %x, ptr noundef byval(i256) align 16 %0, i32 noundef %y)
+__int256_t f_mixed(long long x, __int256_t a, int y) { return a; }
+
+// 128-bit: still returned directly in registers (2 eightbytes)
+// CHECK-LABEL: define{{.*}} i128 @f_ret128(i128 noundef %a)
+__int128_t f_ret128(__int128_t a) { return a; }
+
+// 3 i256 args: all passed via byval pointers
+// CHECK-LABEL: define{{.*}} void @f_three256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1, ptr noundef byval(i256) align 16 %2)
+__int256_t f_three256(__int256_t a, __int256_t b, __int256_t c) { return a + b + c; }
+
+// Struct containing a 256-bit integer: passed/returned via sret/byval
+struct s256 { __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct256(ptr dead_on_unwind noalias writable sret(%struct.s256) align 16 %{{.*}}, ptr noundef byval(%struct.s256) align 16 %s)
+struct s256 f_struct256(struct s256 s) { return s; }
+
+// Nested struct with __int256
+struct nested256 { int x; __int256_t val; int y; };
+
+// CHECK-LABEL: define{{.*}} void @f_nested256(ptr dead_on_unwind noalias writable sret(%struct.nested256) align 16 %{{.*}}, ptr noundef byval(%struct.nested256) align 16 %s)
+struct nested256 f_nested256(struct nested256 s) { return s; }
diff --git a/clang/test/CodeGen/X86/x86_64-atomic-i256.c b/clang/test/CodeGen/X86/x86_64-atomic-i256.c
new file mode 100644
index 0000000000000..73aa91d60c5c7
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86_64-atomic-i256.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify that _Atomic __int256 operations generate the correct libcalls.
+// __int256 is too large for inline atomics (256 bits > cmpxchg16b), so all
+// operations must route through __atomic_* libcalls with size=32.
+
+_Atomic __int256_t glob;
+
+// CHECK-LABEL: define{{.*}} void @atomic_load(ptr{{.*}}sret(i256)
+// CHECK: call void @__atomic_load(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, i32 noundef 5)
+__int256_t atomic_load(void) {
+  return __c11_atomic_load(&glob, __ATOMIC_SEQ_CST);
+}
+
+// CHECK-LABEL: define{{.*}} void @atomic_store(ptr noundef byval(i256) align 16 %0)
+// CHECK: call void @__atomic_store(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, i32 noundef 3)
+void atomic_store(__int256_t val) {
+  __c11_atomic_store(&glob, val, __ATOMIC_RELEASE);
+}
+
+// CHECK-LABEL: define{{.*}} void @atomic_exchange(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0)
+// CHECK: call void @__atomic_exchange(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, ptr noundef %{{.*}}, i32 noundef 5)
+__int256_t atomic_exchange(__int256_t val) {
+  return __c11_atomic_exchange(&glob, val, __ATOMIC_SEQ_CST);
+}
+
+// CHECK-LABEL: define{{.*}} i1 @atomic_cas(
+// CHECK: call{{.*}} i1 @__atomic_compare_exchange(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, ptr noundef %{{.*}}, i32 noundef 4, i32 noundef 2)
+_Bool atomic_cas(__int256_t *expected, __int256_t desired) {
+  return __c11_atomic_compare_exchange_strong(
+      &glob, expected, desired, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+}
diff --git a/clang/test/CodeGen/debug-info-int256.c b/clang/test/CodeGen/debug-info-int256.c
new file mode 100644
index 0000000000000..eeee2dddfd7f6
--- /dev/null
+++ b/clang/test/CodeGen/debug-info-int256.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -debug-info-kind=standalone -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -debug-info-kind=standalone -emit-llvm -o - %s | FileCheck %s
+
+// Verify DWARF debug info encoding for __int256_t and __uint256_t.
+
+__int256_t s256;
+__uint256_t u256;
+
+// CHECK-DAG: !DIBasicType(name: "__int256", size: 256, encoding: DW_ATE_signed)
+// CHECK-DAG: !DIBasicType(name: "unsigned __int256", size: 256, encoding: DW_ATE_unsigned)
+// CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__int256_t"
+// CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__uint256_t"
diff --git a/clang/test/CodeGen/float-conv-int256.c b/clang/test/CodeGen/float-conv-int256.c
new file mode 100644
index 0000000000000..639eb9fe956c6
--- /dev/null
+++ b/clang/test/CodeGen/float-conv-int256.c
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Test float <-> __int256_t conversions.
+
+// === Signed -> Float ===
+
+// CHECK-LABEL: define {{.*}}@int256_to_double
+// CHECK: sitofp i256 %{{.*}} to double
+double int256_to_double(__int256_t x) { return (double)x; }
+
+// CHECK-LABEL: define {{.*}}@int256_to_float
+// CHECK: sitofp i256 %{{.*}} to float
+float int256_to_float(__int256_t x) { return (float)x; }
+
+// CHECK-LABEL: define {{.*}}@int256_to_longdouble
+// CHECK: sitofp i256 %{{.*}} to x86_fp80
+long double int256_to_longdouble(__int256_t x) { return (long double)x; }
+
+// === Unsigned -> Float ===
+
+// CHECK-LABEL: define {{.*}}@uint256_to_double
+// CHECK: uitofp i256 %{{.*}} to double
+double uint256_to_double(__uint256_t x) { return (double)x; }
+
+// CHECK-LABEL: define {{.*}}@uint256_to_float
+// CHECK: uitofp i256 %{{.*}} to float
+float uint256_to_float(__uint256_t x) { return (float)x; }
+
+// CHECK-LABEL: define {{.*}}@uint256_to_longdouble
+// CHECK: uitofp i256 %{{.*}} to x86_fp80
+long double uint256_to_longdouble(__uint256_t x) { return (long double)x; }
+
+// === Float -> Signed ===
+
+// CHECK-LABEL: define {{.*}}@double_to_int256
+// CHECK: fptosi double %{{.*}} to i256
+__int256_t double_to_int256(double x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define {{.*}}@float_to_int256
+// CHECK: fptosi float %{{.*}} to i256
+__int256_t float_to_int256(float x) { return (__int256_t)x; }
+
+// === Float -> Unsigned ===
+
+// CHECK-LABEL: define {{.*}}@double_to_uint256
+// CHECK: fptoui double %{{.*}} to i256
+__uint256_t double_to_uint256(double x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define {{.*}}@float_to_uint256
+// CHECK: fptoui float %{{.*}} to i256
+__uint256_t float_to_uint256(float x) { return (__uint256_t)x; }
+
+// === Long Double -> Unsigned ===
+
+// CHECK-LABEL: define {{.*}}@longdouble_to_uint256
+// CHECK: fptoui x86_fp80 %{{.*}} to i256
+__uint256_t longdouble_to_uint256(long double x) { return (__uint256_t)x; }
+
+// === Long Double -> Signed ===
+
+// CHECK-LABEL: define {{.*}}@longdouble_to_int256
+// CHECK: fptosi x86_fp80 %{{.*}} to i256
+__int256_t longdouble_to_int256(long double x) { return (__int256_t)x; }
diff --git a/clang/test/CodeGen/int256-func-ptr.c b/clang/test/CodeGen/int256-func-ptr.c
new file mode 100644
index 0000000000000..e59d2493baba9
--- /dev/null
+++ b/clang/test/CodeGen/int256-func-ptr.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X86
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=AARCH64
+
+// Verify __int256 works correctly through function pointers and extern decls.
+
+typedef __int256 (*binop_t)(__int256, __int256);
+typedef int (*pred_t)(__int256, __int256);
+
+// X86-LABEL: define{{.*}} void @call_binop(ptr{{.*}}sret(i256){{.*}}, ptr noundef %fn, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+// X86: call void %{{.*}}(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %{{.*}})
+// AARCH64-LABEL: define{{.*}} i256 @call_binop(ptr noundef %fn, i256 noundef %a, i256 noundef %b)
+// AARCH64: call i256 %{{.*}}(i256 noundef %{{.*}}, i256 noundef %{{.*}})
+__int256 call_binop(binop_t fn, __int256 a, __int256 b) {
+  return fn(a, b);
+}
+
+// X86-LABEL: define{{.*}} i32 @call_pred(ptr noundef %fn, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+// X86: call i32 %{{.*}}(ptr noundef byval(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %{{.*}})
+// AARCH64-LABEL: define{{.*}} i32 @call_pred(ptr noundef %fn, i256 noundef %a, i256 noundef %b)
+// AARCH64: call i32 %{{.*}}(i256 noundef %{{.*}}, i256 noundef %{{.*}})
+int call_pred(pred_t fn, __int256 a, __int256 b) {
+  return fn(a, b);
+}
+
+// Cross-TU: extern function with __int256 params
+extern __int256 extern_add(__int256 a, __int256 b);
+
+// X86-LABEL: define{{.*}} void @call_extern(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+// X86: call void @extern_add(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %{{.*}})
+// AARCH64-LABEL: define{{.*}} i256 @call_extern(i256 noundef %a, i256 noundef %b)
+// AARCH64: call i256 @extern_add(i256 noundef %{{.*}}, i256 noundef %{{.*}})
+__int256 call_extern(__int256 a, __int256 b) {
+  return extern_add(a, b);
+}
diff --git a/clang/test/CodeGen/int256-globals.c b/clang/test/CodeGen/int256-globals.c
new file mode 100644
index 0000000000000..f974f37402522
--- /dev/null
+++ b/clang/test/CodeGen/int256-globals.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify __int256 global/static/extern variable declarations and access.
+
+// CHECK-DAG: @global_s = global i256 0, align 16
+__int256_t global_s;
+
+// CHECK-DAG: @global_u = global i256 42, align 16
+__uint256_t global_u = 42;
+
+// CHECK-DAG: @static_s = internal global i256 0, align 16
+static __int256_t static_s;
+
+// CHECK-DAG: @extern_s = external global i256, align 16
+extern __int256_t extern_s;
+
+// CHECK-LABEL: define{{.*}} void @read_global(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @global_s, align 16
+__int256_t read_global(void) { return global_s; }
+
+// CHECK-LABEL: define{{.*}} void @write_global(ptr{{.*}}byval(i256) align 16
+// CHECK: store i256 %{{.*}}, ptr @global_s, align 16
+void write_global(__int256_t v) { global_s = v; }
+
+// CHECK-LABEL: define{{.*}} void @read_static(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @static_s, align 16
+__int256_t read_static(void) { return static_s; }
+
+// CHECK-LABEL: define{{.*}} void @write_static(ptr{{.*}}byval(i256) align 16
+// CHECK: store i256 %{{.*}}, ptr @static_s, align 16
+void write_static(__int256_t v) { static_s = v; }
+
+// CHECK-LABEL: define{{.*}} void @read_extern(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @extern_s, align 16
+__int256_t read_extern(void) { return extern_s; }
+
+// CHECK-LABEL: define{{.*}} void @read_global_u(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @global_u, align 16
+__uint256_t read_global_u(void) { return global_u; }
diff --git a/clang/test/CodeGen/overflow-builtins-int256.c b/clang/test/CodeGen/overflow-builtins-int256.c
new file mode 100644
index 0000000000000..86795d4ddcc34
--- /dev/null
+++ b/clang/test/CodeGen/overflow-builtins-int256.c
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Test overflow builtins with __int256_t and __uint256_t.
+
+void overflowed(void);
+
+// CHECK-LABEL: define {{.*}}@test_sadd_overflow_int256
+// CHECK: call { i256, i1 } @llvm.sadd.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_sadd_overflow_int256(__int256_t x, __int256_t y) {
+  __int256_t r;
+  if (__builtin_add_overflow(x, y, &r))
+    overflowed();
+  return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_uadd_overflow_uint256
+// CHECK: call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_uadd_overflow_uint256(__uint256_t x, __uint256_t y) {
+  __uint256_t r;
+  if (__builtin_add_overflow(x, y, &r))
+    overflowed();
+  return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_ssub_overflow_int256
+// CHECK: call { i256, i1 } @llvm.ssub.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_ssub_overflow_int256(__int256_t x, __int256_t y) {
+  __int256_t r;
+  if (__builtin_sub_overflow(x, y, &r))
+    overflowed();
+  return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_usub_overflow_uint256
+// CHECK: call { i256, i1 } @llvm.usub.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_usub_overflow_uint256(__uint256_t x, __uint256_t y) {
+  __uint256_t r;
+  if (__builtin_sub_overflow(x, y, &r))
+    overflowed();
+  return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_smul_overflow_int256
+// CHECK: call { i256, i1 } @llvm.smul.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_smul_overflow_int256(__int256_t x, __int256_t y) {
+  __int256_t r;
+  if (__builtin_mul_overflow(x, y, &r))
+    overflowed();
+  return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_umul_overflow_uint256
+// CHECK: call { i256, i1 } @llvm.umul.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_umul_overflow_uint256(__uint256_t x, __uint256_t y) {
+  __uint256_t r;
+  if (__builtin_mul_overflow(x, y, &r))
+    overflowed();
+  return (int)r;
+}
diff --git a/clang/test/CodeGen/uint256_t.c b/clang/test/CodeGen/uint256_t.c
new file mode 100644
index 0000000000000..30ceb6a785f18
--- /dev/null
+++ b/clang/test/CodeGen/uint256_t.c
@@ -0,0 +1,239 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-apple-darwin9 | FileCheck %s
+
+// Basic arithmetic code generation for __uint256_t / __int256_t.
+// Verifies that all operations lower to i256 LLVM IR.
+// On x86-64, __int256 is passed/returned via byval/sret (Memory class).
+
+// CHECK-LABEL: define{{.*}} void @add256(ptr{{.*}}sret(i256)
+// CHECK: add nsw i256
+__int256_t add256(__int256_t a, __int256_t b) { return a + b; }
+
+// CHECK-LABEL: define{{.*}} void @sub256(ptr{{.*}}sret(i256)
+// CHECK: sub nsw i256
+__int256_t sub256(__int256_t a, __int256_t b) { return a - b; }
+
+// CHECK-LABEL: define{{.*}} void @mul256(ptr{{.*}}sret(i256)
+// CHECK: mul i256
+__uint256_t mul256(__uint256_t a, __uint256_t b) { return a * b; }
+
+// CHECK-LABEL: define{{.*}} void @div256(ptr{{.*}}sret(i256)
+// CHECK: udiv i256
+__uint256_t div256(__uint256_t a, __uint256_t b) { return a / b; }
+
+// CHECK-LABEL: define{{.*}} void @sdiv256(ptr{{.*}}sret(i256)
+// CHECK: sdiv i256
+__int256_t sdiv256(__int256_t a, __int256_t b) { return a / b; }
+
+// Bitwise operations -- core of Hamming distance / popcount patterns
+// CHECK-LABEL: define{{.*}} void @xor256(ptr{{.*}}sret(i256)
+// CHECK: xor i256
+__uint256_t xor256(__uint256_t a, __uint256_t b) { return a ^ b; }
+
+// CHECK-LABEL: define{{.*}} void @and256(ptr{{.*}}sret(i256)
+// CHECK: and i256
+__uint256_t and256(__uint256_t a, __uint256_t b) { return a & b; }
+
+// CHECK-LABEL: define{{.*}} void @or256(ptr{{.*}}sret(i256)
+// CHECK: or i256
+__uint256_t or256(__uint256_t a, __uint256_t b) { return a | b; }
+
+// CHECK-LABEL: define{{.*}} void @not256(ptr{{.*}}sret(i256)
+// CHECK: xor i256 %{{.*}}, -1
+__uint256_t not256(__uint256_t a) { return ~a; }
+
+// CHECK-LABEL: define{{.*}} void @shl256(ptr{{.*}}sret(i256)
+// CHECK: shl i256
+__uint256_t shl256(__uint256_t a, __uint256_t b) { return a << b; }
+
+// CHECK-LABEL: define{{.*}} void @lshr256(ptr{{.*}}sret(i256)
+// CHECK: lshr i256
+__uint256_t lshr256(__uint256_t a, __uint256_t b) { return a >> b; }
+
+// CHECK-LABEL: define{{.*}} void @ashr256(ptr{{.*}}sret(i256)
+// CHECK: ashr i256
+__int256_t ashr256(__int256_t a, __int256_t b) { return a >> b; }
+
+// Widening conversion from uint64_t
+// CHECK-LABEL: define{{.*}} void @widen(ptr{{.*}}sret(i256){{.*}}, i64
+// CHECK: zext i64 %{{.*}} to i256
+__uint256_t widen(unsigned long long x) { return (__uint256_t)x; }
+
+// Narrowing conversion to uint64_t
+// CHECK-LABEL: define{{.*}} i64 @narrow(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i64
+unsigned long long narrow(__uint256_t x) { return (unsigned long long)x; }
+
+// Conversion between i128 and i256
+// CHECK-LABEL: define{{.*}} void @from128(ptr{{.*}}sret(i256){{.*}}, i128
+// CHECK: sext i128 %{{.*}} to i256
+__int256_t from128(__int128_t x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} i128 @to128(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i128
+__int128_t to128(__int256_t x) { return (__int128_t)x; }
+
+// Comparison
+// CHECK-LABEL: define{{.*}} i32 @cmp_eq(ptr{{.*}}byval(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK: icmp eq i256
+int cmp_eq(__int256_t a, __int256_t b) { return a == b; }
+
+// CHECK-LABEL: define{{.*}} i32 @cmp_slt(ptr{{.*}}byval(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK: icmp slt i256
+int cmp_slt(__int256_t a, __int256_t b) { return a < b; }
+
+// CHECK-LABEL: define{{.*}} i32 @cmp_ult(ptr{{.*}}byval(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK: icmp ult i256
+int cmp_ult(__uint256_t a, __uint256_t b) { return a < b; }
+
+// Unsigned remainder
+// CHECK-LABEL: define{{.*}} void @urem256(ptr{{.*}}sret(i256)
+// CHECK: urem i256
+__uint256_t urem256(__uint256_t a, __uint256_t b) { return a % b; }
+
+// Signed remainder
+// CHECK-LABEL: define{{.*}} void @srem256(ptr{{.*}}sret(i256)
+// CHECK: srem i256
+__int256_t srem256(__int256_t a, __int256_t b) { return a % b; }
+
+// Unary minus
+// CHECK-LABEL: define{{.*}} void @neg256(ptr{{.*}}sret(i256)
+// CHECK: sub nsw i256 0,
+__int256_t neg256(__int256_t a) { return -a; }
+
+// Bool conversion
+// CHECK-LABEL: define{{.*}} i32 @bool256(ptr{{.*}}byval(i256)
+// CHECK: icmp ne i256 %{{.*}}, 0
+int bool256(__uint256_t a) { return !!a; }
+
+// ===----------------------------------------------------------------------===
+// Comprehensive cast / conversion tests
+// ===----------------------------------------------------------------------===
+
+// --- Widening: signed small -> signed i256 (sign-extend) ---
+
+// CHECK-LABEL: define{{.*}} void @widen_schar(ptr{{.*}}sret(i256)
+// CHECK: sext i8 %{{.*}} to i256
+__int256_t widen_schar(signed char x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_short(ptr{{.*}}sret(i256)
+// CHECK: sext i16 %{{.*}} to i256
+__int256_t widen_short(short x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_int(ptr{{.*}}sret(i256)
+// CHECK: sext i32 %{{.*}} to i256
+__int256_t widen_int(int x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_long(ptr{{.*}}sret(i256)
+// CHECK: sext i64 %{{.*}} to i256
+__int256_t widen_long(long long x) { return (__int256_t)x; }
+
+// --- Widening: unsigned small -> unsigned i256 (zero-extend) ---
+
+// CHECK-LABEL: define{{.*}} void @widen_uchar(ptr{{.*}}sret(i256)
+// CHECK: zext i8 %{{.*}} to i256
+__uint256_t widen_uchar(unsigned char x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_ushort(ptr{{.*}}sret(i256)
+// CHECK: zext i16 %{{.*}} to i256
+__uint256_t widen_ushort(unsigned short x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_uint(ptr{{.*}}sret(i256)
+// CHECK: zext i32 %{{.*}} to i256
+__uint256_t widen_uint(unsigned int x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_ulong(ptr{{.*}}sret(i256)
+// CHECK: zext i64 %{{.*}} to i256
+__uint256_t widen_ulong(unsigned long long x) { return (__uint256_t)x; }
+
+// --- Widening: unsigned i128 -> unsigned i256 (zero-extend) ---
+
+// CHECK-LABEL: define{{.*}} void @widen_u128(ptr{{.*}}sret(i256)
+// CHECK: zext i128 %{{.*}} to i256
+__uint256_t widen_u128(__uint128_t x) { return (__uint256_t)x; }
+
+// --- Widening: signed small -> unsigned i256 (sign-extend then implicit) ---
+// C semantics: the signed value is converted to __uint256_t modulo 2^256,
+// which for a negative source is realized as a direct sext to i256.
+
+// CHECK-LABEL: define{{.*}} void @widen_schar_to_u256(ptr{{.*}}sret(i256)
+// CHECK: sext i8 %{{.*}} to i256
+__uint256_t widen_schar_to_u256(signed char x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_int_to_u256(ptr{{.*}}sret(i256)
+// CHECK: sext i32 %{{.*}} to i256
+__uint256_t widen_int_to_u256(int x) { return (__uint256_t)x; }
+
+// --- Narrowing: i256 -> small types (truncate) ---
+
+// CHECK-LABEL: define{{.*}} signext i8 @narrow_to_schar(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i8
+signed char narrow_to_schar(__int256_t x) { return (signed char)x; }
+
+// CHECK-LABEL: define{{.*}} zeroext i8 @narrow_to_uchar(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i8
+unsigned char narrow_to_uchar(__uint256_t x) { return (unsigned char)x; }
+
+// CHECK-LABEL: define{{.*}} signext i16 @narrow_to_short(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i16
+short narrow_to_short(__int256_t x) { return (short)x; }
+
+// CHECK-LABEL: define{{.*}} zeroext i16 @narrow_to_ushort(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i16
+unsigned short narrow_to_ushort(__uint256_t x) { return (unsigned short)x; }
+
+// CHECK-LABEL: define{{.*}} i32 @narrow_to_int(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i32
+int narrow_to_int(__int256_t x) { return (int)x; }
+
+// CHECK-LABEL: define{{.*}} i32 @narrow_to_uint(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i32
+unsigned int narrow_to_uint(__uint256_t x) { return (unsigned int)x; }
+
+// CHECK-LABEL: define{{.*}} i64 @narrow_to_long(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i64
+long long narrow_to_long(__int256_t x) { return (long long)x; }
+
+// CHECK-LABEL: define{{.*}} i64 @narrow_to_ulong(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i64
+unsigned long long narrow_to_ulong(__uint256_t x) {
+  return (unsigned long long)x;
+}
+
+// --- Narrowing: i256 -> i128 (unsigned) ---
+
+// CHECK-LABEL: define{{.*}} i128 @narrow_to_u128(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i128
+__uint128_t narrow_to_u128(__uint256_t x) { return (__uint128_t)x; }
+
+// --- Cross-sign: signed <-> unsigned i256 (no-op, same bit pattern) ---
+
+// CHECK-LABEL: define{{.*}} void @signed_to_unsigned(ptr{{.*}}sret(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK-NOT: ext
+// CHECK-NOT: trunc
+// CHECK: ret void
+__uint256_t signed_to_unsigned(__int256_t x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @unsigned_to_signed(ptr{{.*}}sret(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK-NOT: ext
+// CHECK-NOT: trunc
+// CHECK: ret void
+__int256_t unsigned_to_signed(__uint256_t x) { return (__int256_t)x; }
+
+// --- Multi-step: negative char -> signed i256 (sign-extension across
+// 248 bits) ---
+// This verifies that converting (signed char)-42 to __int256_t produces a 256-bit -42
+// via sign-extension, not a large positive number.
+
+// CHECK-LABEL: define{{.*}} void @neg_char_to_i256(ptr{{.*}}sret(i256)
+// CHECK: sext i8 %{{.*}} to i256
+__int256_t neg_char_to_i256(signed char x) { return x; }
+
+// --- Implicit conversions (no explicit cast) ---
+
+// CHECK-LABEL: define{{.*}} void @implicit_int_to_i256(ptr{{.*}}sret(i256)
+// CHECK: sext i32 %{{.*}} to i256
+__int256_t implicit_int_to_i256(int x) { return x; }
+
+// CHECK-LABEL: define{{.*}} i32 @implicit_i256_to_int(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i32
+int implicit_i256_to_int(__int256_t x) { return x; }
diff --git a/clang/test/CodeGen/varargs-int256.c b/clang/test/CodeGen/varargs-int256.c
new file mode 100644
index 0000000000000..22e61d22d598e
--- /dev/null
+++ b/clang/test/CodeGen/varargs-int256.c
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X86
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=AARCH64
+
+// Test that __int256 works correctly with variadic functions (va_arg).
+
+typedef __builtin_va_list va_list;
+
+// x86_64: return via sret (Memory class per SysV ABI)
+// X86-LABEL: define{{.*}} void @va_int256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, i32 noundef %n, ...)
+// X86: load i256, ptr %{{.*}}, align 16
+
+// AArch64: return directly (4 GPRs)
+// AARCH64-LABEL: define{{.*}} i256 @va_int256(i32 noundef %n, ...)
+// AARCH64: load i256, ptr %{{.*}}, align
+__int256 va_int256(int n, ...) {
+  va_list ap;
+  __builtin_va_start(ap, n);
+  __int256 v = __builtin_va_arg(ap, __int256);
+  __builtin_va_end(ap);
+  return v;
+}
+
+// Test passing __int256 to a variadic function call.
+void callee(int, ...);
+
+// x86_64: __int256 passed via byval pointer
+// X86-LABEL: define{{.*}} void @pass_int256(ptr noundef byval(i256) align 16 %0)
+// X86: call void (i32, ...) @callee(i32 noundef 1, ptr noundef byval(i256) align 16 %
+
+// AArch64: __int256 passed directly
+// AARCH64-LABEL: define{{.*}} void @pass_int256(i256 noundef %x)
+// AARCH64: call void (i32, ...) @callee(i32 noundef 1, i256 noundef %
+void pass_int256(__int256 x) {
+  callee(1, x);
+}
+
+// Multiple va_arg fetches of __int256
+// X86-LABEL: define{{.*}} void @va_two(ptr{{.*}}sret(i256){{.*}}, i32 noundef %n, ...)
+// X86: load i256, ptr %{{.*}}, align 16
+// X86: load i256, ptr %{{.*}}, align 16
+// X86: add nsw i256
+
+// AARCH64-LABEL: define{{.*}} i256 @va_two(i32 noundef %n, ...)
+// AARCH64: load i256
+// AARCH64: load i256
+// AARCH64: add nsw i256
+__int256 va_two(int n, ...) {
+  va_list ap;
+  __builtin_va_start(ap, n);
+  __int256 a = __builtin_va_arg(ap, __int256);
+  __int256 b = __builtin_va_arg(ap, __int256);
+  __builtin_va_end(ap);
+  return a + b;
+}
+
+// Mixed sizes in varargs: int, __int256, long long
+// X86-LABEL: define{{.*}} i64 @va_mixed(i32 noundef %n, ...)
+// AARCH64-LABEL: define{{.*}} i64 @va_mixed(i32 noundef %n, ...)
+long long va_mixed(int n, ...) {
+  va_list ap;
+  __builtin_va_start(ap, n);
+  int x = __builtin_va_arg(ap, int);
+  __int256 big = __builtin_va_arg(ap, __int256);
+  long long y = __builtin_va_arg(ap, long long);
+  __builtin_va_end(ap);
+  return x + (long long)big + y;
+}
diff --git a/clang/test/CodeGenCXX/mangle-int256.cpp b/clang/test/CodeGenCXX/mangle-int256.cpp
new file mode 100644
index 0000000000000..758cbf9e47619
--- /dev/null
+++ b/clang/test/CodeGenCXX/mangle-int256.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-apple-darwin9 | FileCheck %s --check-prefix=ITANIUM
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc | FileCheck %s --check-prefix=MS
+
+// Verify Itanium C++ name mangling for __int256_t / __uint256_t.
+// These use vendor-extended type mangling since there are no standard
+// single-letter codes for 256-bit integers (unlike 'n'/'o' for 128-bit).
+
+// Verify Microsoft C++ name mangling for __int256_t / __uint256_t.
+// These use $$_L / $$_M (extending the _L / _M pattern for __int128).
+
+// ITANIUM-LABEL: define{{.*}} void @_Z3f01u7__int256u8__uint256
+// MS-LABEL: define{{.*}} void @"?f01@@YAX$$_L$$_M at Z"
+void f01(__int256_t, __uint256_t) {}
+
+// ITANIUM-LABEL: define{{.*}} void @_Z3f02no
+// MS-LABEL: define{{.*}} void @"?f02@@YAX_L_M at Z"
+void f02(__int128_t, __uint128_t) {}
+
+// Overloading: __int256_t vs __int128_t should produce different manglings
+// ITANIUM-LABEL: define{{.*}} void @_Z3f03n
+// MS-LABEL: define{{.*}} void @"?f03@@YAX_L at Z"
+void f03(__int128_t) {}
+// ITANIUM-LABEL: define{{.*}} void @_Z3f03u7__int256
+// MS-LABEL: define{{.*}} void @"?f03@@YAX$$_L at Z"
+void f03(__int256_t) {}
+
+// ITANIUM-LABEL: define{{.*}} void @_Z3f04o
+// MS-LABEL: define{{.*}} void @"?f04@@YAX_M at Z"
+void f04(__uint128_t) {}
+// ITANIUM-LABEL: define{{.*}} void @_Z3f04u8__uint256
+// MS-LABEL: define{{.*}} void @"?f04@@YAX$$_M at Z"
+void f04(__uint256_t) {}
diff --git a/clang/test/Modules/decl-params-determinisim.m b/clang/test/Modules/decl-params-determinisim.m
index db4ed33265388..cddad068837b8 100644
--- a/clang/test/Modules/decl-params-determinisim.m
+++ b/clang/test/Modules/decl-params-determinisim.m
@@ -28,23 +28,23 @@
 
 // CHECK: <TYPE_FUNCTION_PROTO
 // CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op5=13
-// CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op5=14
-// CHECK-NEXT: <DECL_PARM_VAR
 // CHECK-SAME: op5=15
 // CHECK-NEXT: <DECL_PARM_VAR
 // CHECK-SAME: op5=16
+// CHECK-NEXT: <DECL_PARM_VAR
+// CHECK-SAME: op5=17
+// CHECK-NEXT: <DECL_PARM_VAR
+// CHECK-SAME: op5=18
 
 /// Decl records start at 43
 // CHECK: <DECL_RECORD
-// CHECK-SAME: op5=54
-// CHECK-NEXT: <DECL_RECORD
-// CHECK-SAME: op5=55
-// CHECK-NEXT: <DECL_RECORD
 // CHECK-SAME: op5=56
 // CHECK-NEXT: <DECL_RECORD
 // CHECK-SAME: op5=57
+// CHECK-NEXT: <DECL_RECORD
+// CHECK-SAME: op5=58
+// CHECK-NEXT: <DECL_RECORD
+// CHECK-SAME: op5=59
 
 //--- headers/a.h
 void f(struct A0 *a0,
diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c
index 09e3fc926a309..77c1cb23a56e1 100644
--- a/clang/test/Preprocessor/init-aarch64.c
+++ b/clang/test/Preprocessor/init-aarch64.c
@@ -267,6 +267,7 @@
 // AARCH64-NEXT: #define __SIZEOF_DOUBLE__ 8
 // AARCH64-NEXT: #define __SIZEOF_FLOAT__ 4
 // AARCH64-NEXT: #define __SIZEOF_INT128__ 16
+// AARCH64-NEXT: #define __SIZEOF_INT256__ 32
 // AARCH64-NEXT: #define __SIZEOF_INT__ 4
 // AARCH64-NEXT: #define __SIZEOF_LONG_DOUBLE__ 16
 // AARCH64-NEXT: #define __SIZEOF_LONG_LONG__ 8
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 80b7a6399e5f4..912226a7906b6 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1939,6 +1939,7 @@
 // WEBASSEMBLY-NEXT:#define __SIZEOF_DOUBLE__ 8
 // WEBASSEMBLY-NEXT:#define __SIZEOF_FLOAT__ 4
 // WEBASSEMBLY-NEXT:#define __SIZEOF_INT128__ 16
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT256__ 32
 // WEBASSEMBLY-NEXT:#define __SIZEOF_INT__ 4
 // WEBASSEMBLY-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16
 // WEBASSEMBLY-NEXT:#define __SIZEOF_LONG_LONG__ 8
diff --git a/clang/test/Sema/256bitint.c b/clang/test/Sema/256bitint.c
new file mode 100644
index 0000000000000..b63c193f438be
--- /dev/null
+++ b/clang/test/Sema/256bitint.c
@@ -0,0 +1,72 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-apple-darwin9 %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple i686-linux-gnu %s -DHAVE_NOT
+// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64-linux-gnu %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple arm-linux-gnueabi %s -DHAVE_NOT
+// RUN: %clang_cc1 -fsyntax-only -verify -triple powerpc64-linux-gnu %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple riscv64-linux-gnu %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple wasm32-unknown-unknown %s -DHAVE_NOT
+// RUN: %clang_cc1 -fsyntax-only -verify -triple wasm64-unknown-unknown %s -DHAVE
+
+#ifdef HAVE
+// expected-no-diagnostics
+
+// __int256 is supported on all 64-bit targets
+
+__int256_t b256s = (__int256_t)0;
+__uint256_t b256u = (__uint256_t)-1;
+
+// Explicit signed/unsigned qualifiers
+__int256 i256 = (__int256)0;
+signed __int256 si256 = (signed __int256)0;
+unsigned __int256 ui256 = (unsigned __int256)-1;
+
+// sizeof / alignof
+int sz[sizeof(__int256_t) == 32 ? 1 : -1];
+int al[_Alignof(__int256_t) == 16 ? 1 : -1];
+int sz2[sizeof(__uint256_t) == 32 ? 1 : -1];
+int al2[_Alignof(__uint256_t) == 16 ? 1 : -1];
+
+// __SIZEOF_INT256__ predefined macro
+int sizemacro[__SIZEOF_INT256__ == 32 ? 1 : -1];
+
+// Basic arithmetic
+__int256_t arith_add(__int256_t a, __int256_t b) { return a + b; }
+__int256_t arith_sub(__int256_t a, __int256_t b) { return a - b; }
+__int256_t arith_mul(__int256_t a, __int256_t b) { return a * b; }
+__int256_t arith_div(__int256_t a, __int256_t b) { return a / b; }
+__int256_t arith_rem(__int256_t a, __int256_t b) { return a % b; }
+
+// Bitwise operations (key for Hamming distance / popcount use cases)
+__uint256_t bit_and(__uint256_t a, __uint256_t b) { return a & b; }
+__uint256_t bit_or(__uint256_t a, __uint256_t b) { return a | b; }
+__uint256_t bit_xor(__uint256_t a, __uint256_t b) { return a ^ b; }
+__uint256_t bit_not(__uint256_t a) { return ~a; }
+__uint256_t bit_shl(__uint256_t a, __uint256_t b) { return a << b; }
+__uint256_t bit_shr(__uint256_t a, __uint256_t b) { return a >> b; }
+
+// Comparisons
+int cmp_eq(__int256_t a, __int256_t b) { return a == b; }
+int cmp_lt(__int256_t a, __int256_t b) { return a < b; }
+int cmp_gt(__int256_t a, __int256_t b) { return a > b; }
+
+// Conversions between int256 and int128
+__int256_t from128(__int128_t x) { return (__int256_t)x; }
+__int128_t to128(__int256_t x) { return (__int128_t)x; }
+
+// Conversion from smaller types
+__int256_t from64(long long x) { return (__int256_t)x; }
+__uint256_t fromu64(unsigned long long x) { return (__uint256_t)x; }
+
+// Typedef equivalence
+typedef __int256_t MyInt256;
+MyInt256 typedef_test(MyInt256 a) { return a; }
+
+#else
+
+__int256 n; // expected-error {{__int256 is not supported on this target}}
+
+#if defined(__SIZEOF_INT256__)
+#error __SIZEOF_INT256__ should not be defined
+#endif
+
+#endif
diff --git a/clang/test/Sema/atomic-builtins-int256.c b/clang/test/Sema/atomic-builtins-int256.c
new file mode 100644
index 0000000000000..4fbb0fffcb5f4
--- /dev/null
+++ b/clang/test/Sema/atomic-builtins-int256.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-linux-gnu %s
+
+// Verify that __sync_* builtins reject __int256 (max atomic width is 16 bytes).
+// The __c11_atomic_* builtins accept __int256 (via libcalls) and are tested
+// separately in atomic-int256.c and CodeGen/X86/x86_64-atomic-i256.c.
+
+__int256 test_sync_add(__int256 *addr, __int256 val) {
+  return __sync_fetch_and_add(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_sub(__int256 *addr, __int256 val) {
+  return __sync_fetch_and_sub(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_or(__int256 *addr, __int256 val) {
+  return __sync_fetch_and_or(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_and(__int256 *addr, __int256 val) {
+  return __sync_fetch_and_and(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_xor(__int256 *addr, __int256 val) {
+  return __sync_fetch_and_xor(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+_Bool test_sync_cas(__int256 *addr, __int256 oldval, __int256 newval) {
+  return __sync_bool_compare_and_swap(addr, oldval, newval); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
diff --git a/clang/test/Sema/atomic-int256.c b/clang/test/Sema/atomic-int256.c
new file mode 100644
index 0000000000000..6257338e50ad4
--- /dev/null
+++ b/clang/test/Sema/atomic-int256.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-linux-gnu %s
+// expected-no-diagnostics
+
+// __int256 is never lock-free (256 bits > max atomic width on any current target)
+_Static_assert(!__atomic_always_lock_free(32, 0), "__int256 should not be always lock-free");
+
+// _Atomic __int256_t variables should compile
+_Atomic __int256_t atomic_s256;
+_Atomic __uint256_t atomic_u256;
+
+// Atomic load/store should compile (will use libcalls)
+__int256_t load_atomic(void) {
+  return __c11_atomic_load(&atomic_s256, __ATOMIC_SEQ_CST);
+}
+
+void store_atomic(__int256_t val) {
+  __c11_atomic_store(&atomic_s256, val, __ATOMIC_SEQ_CST);
+}
+
+__uint256_t load_atomic_unsigned(void) {
+  return __c11_atomic_load(&atomic_u256, __ATOMIC_SEQ_CST);
+}
+
+void store_atomic_unsigned(__uint256_t val) {
+  __c11_atomic_store(&atomic_u256, val, __ATOMIC_SEQ_CST);
+}
diff --git a/clang/test/Sema/bitfield-int256.c b/clang/test/Sema/bitfield-int256.c
new file mode 100644
index 0000000000000..89a0bdc242668
--- /dev/null
+++ b/clang/test/Sema/bitfield-int256.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-linux-gnu %s
+
+// Test __int256 bitfield support.
+
+struct S1 {
+  __int256 x : 200;
+  __int256 y : 56;
+};
+
+_Static_assert(sizeof(struct S1) == 32, "S1 should be 32 bytes");
+
+struct S2 {
+  char a;
+  __int256 x : 100;
+};
+
+struct S3 {
+  unsigned __int256 x : 256; // Full width bitfield
+};
+
+_Static_assert(sizeof(struct S3) == 32, "S3 should be 32 bytes");
+
+struct S4 {
+  __int256 x : 1; // Single bit signed
+  unsigned __int256 y : 1; // Single bit unsigned
+};
+
+// Test bitfield access
+int test_bitfield(void) {
+  struct S1 s = {};
+  s.x = 42;
+  s.y = -1;
+  return (int)s.x + (int)s.y;
+}
+
+// Test zero-width bitfield
+struct S5 {
+  __int256 : 0; // Zero-width bitfield for alignment
+  int x;
+};
+
+// expected-no-diagnostics
diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c
index 53face901d75e..9be6cd73b4355 100644
--- a/clang/test/Sema/const-eval.c
+++ b/clang/test/Sema/const-eval.c
@@ -143,6 +143,11 @@ void *PR28739b = &PR28739b + (__int128)(unsigned long)-1;                  // ex
 __int128 PR28739c = (&PR28739c + (__int128)(unsigned long)-1) - &PR28739c; // expected-warning {{refers past the last possible element}}
 void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1];                // expected-warning {{refers past the last possible element}}
 
+#ifdef __SIZEOF_INT256__
+// __int256 pointer arithmetic -- same pattern as __int128 above.
+__int256 PR28739_256 = (&PR28739_256 + (__int256)(unsigned long)-1) - &PR28739_256; // expected-warning {{refers past the last possible element}}
+#endif
+
 struct PR35214_X {
   int k;
   int arr[];
diff --git a/clang/test/Sema/constant-builtins-2.c b/clang/test/Sema/constant-builtins-2.c
index bb005981b6daf..20dbd6c584cf3 100644
--- a/clang/test/Sema/constant-builtins-2.c
+++ b/clang/test/Sema/constant-builtins-2.c
@@ -315,6 +315,12 @@ char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ?
 char clz56[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1))) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
 char clz57[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1)), 42) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
 #endif
+#ifdef __SIZEOF_INT256__
+int clz256_0 = __builtin_clzg((unsigned __int256)0); // expected-error {{not a compile-time constant}}
+char clz256_1[__builtin_clzg((unsigned __int256)0, 42) == 42 ? 1 : -1];
+char clz256_2[__builtin_clzg((unsigned __int256)0x1) == BITSIZE(__int256) - 1 ? 1 : -1];
+char clz256_3[__builtin_clzg((unsigned __int256)0xf) == BITSIZE(__int256) - 4 ? 1 : -1];
+#endif
 int clz58 = __builtin_clzg((unsigned _BitInt(128))0); // expected-error {{not a compile-time constant}}
 char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
 char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -381,6 +387,12 @@ char ctz53[__builtin_ctzg((unsigned __int128)0x10, 42) == 4 ? 1 : -1];
 char ctz54[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1)) == BITSIZE(__int128) - 1 ? 1 : -1];
 char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
+#ifdef __SIZEOF_INT256__
+int ctz256_0 = __builtin_ctzg((unsigned __int256)0); // expected-error {{not a compile-time constant}}
+char ctz256_1[__builtin_ctzg((unsigned __int256)0, 42) == 42 ? 1 : -1];
+char ctz256_2[__builtin_ctzg((unsigned __int256)0x1) == 0 ? 1 : -1];
+char ctz256_3[__builtin_ctzg((unsigned __int256)0x10) == 4 ? 1 : -1];
+#endif
 int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0); // expected-error {{not a compile-time constant}}
 char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
 char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
@@ -408,6 +420,9 @@ char popcount15[__builtin_popcountg(~0ULL) == BITSIZE(long long) ? 1 : -1];
 #ifdef __SIZEOF_INT128__
 char popcount16[__builtin_popcountg(~(unsigned __int128)0) == BITSIZE(__int128) ? 1 : -1];
 #endif
+#ifdef __SIZEOF_INT256__
+char popcount256[__builtin_popcountg(~(unsigned __int256)0) == BITSIZE(__int256) ? 1 : -1];
+#endif
 char popcount17[__builtin_popcountg(~(unsigned _BitInt(128))0) == BITSIZE(_BitInt(128)) ? 1 : -1];
 
 char parity1[__builtin_parity(0) == 0 ? 1 : -1];
diff --git a/clang/test/Sema/enum.c b/clang/test/Sema/enum.c
index f0da5f097fa80..9f9e4d9baeabc 100644
--- a/clang/test/Sema/enum.c
+++ b/clang/test/Sema/enum.c
@@ -206,7 +206,9 @@ _Static_assert(
     long long : 0,
     unsigned long long : 0,
     __int128_t : 0,
-    __uint128_t : 1
+    __uint128_t : 1,
+    __int256_t : 0,
+    __uint256_t : 0
     )
 );
 
diff --git a/clang/test/Sema/struct-layout-int256.c b/clang/test/Sema/struct-layout-int256.c
new file mode 100644
index 0000000000000..6f56ee921a086
--- /dev/null
+++ b/clang/test/Sema/struct-layout-int256.c
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-linux-gnu %s
+
+// Test struct layout, alignment, and padding with __int256.
+
+// Basic alignment and size
+_Static_assert(sizeof(__int256) == 32, "");
+_Static_assert(_Alignof(__int256) == 16, "");
+_Static_assert(sizeof(unsigned __int256) == 32, "");
+
+// Struct with __int256 member
+struct Basic {
+  __int256 x;
+};
+_Static_assert(sizeof(struct Basic) == 32, "");
+_Static_assert(_Alignof(struct Basic) == 16, "");
+
+// Struct with padding before __int256
+struct Padded {
+  char a;
+  __int256 x;
+};
+// 'a' at offset 0 (1 byte), 15 bytes padding, 'x' at offset 16
+_Static_assert(sizeof(struct Padded) == 48, "");
+_Static_assert(_Alignof(struct Padded) == 16, "");
+
+// Struct with multiple __int256 members
+struct Multi {
+  __int256 x;
+  __int256 y;
+};
+_Static_assert(sizeof(struct Multi) == 64, "");
+
+// Nested struct
+struct Nested {
+  struct Basic inner;
+  int z;
+};
+_Static_assert(sizeof(struct Nested) == 48, ""); // 32 + 4 + 12 padding
+
+// Union with __int256
+union U {
+  __int256 x;
+  char bytes[32];
+  long long parts[4];
+};
+_Static_assert(sizeof(union U) == 32, "");
+_Static_assert(_Alignof(union U) == 16, "");
+
+// Array of __int256
+struct ArrayMember {
+  __int256 arr[2];
+};
+_Static_assert(sizeof(struct ArrayMember) == 64, "");
+
+// Packed struct
+struct __attribute__((packed)) Packed {
+  char a;
+  __int256 x;
+};
+_Static_assert(sizeof(struct Packed) == 33, "");
+_Static_assert(_Alignof(struct Packed) == 1, "");
+
+// Aligned struct override
+struct __attribute__((aligned(64))) OverAligned {
+  __int256 x;
+};
+_Static_assert(sizeof(struct OverAligned) == 64, "");
+_Static_assert(_Alignof(struct OverAligned) == 64, "");
+
+// expected-no-diagnostics
diff --git a/clang/test/Sema/tautological-constant-compare.c b/clang/test/Sema/tautological-constant-compare.c
index 04b8a1416be0b..561979a3665a7 100644
--- a/clang/test/Sema/tautological-constant-compare.c
+++ b/clang/test/Sema/tautological-constant-compare.c
@@ -486,6 +486,11 @@ int main(void)
   if (i128 == -1) // used to crash
       return 0;
 #endif
+#if __SIZEOF_INT256__
+  __int256 i256 = value();
+  if (i256 == -1) // mirrors __int128 test above
+      return 0;
+#endif
 
 
   enum E {
diff --git a/clang/test/Sema/types.c b/clang/test/Sema/types.c
index 2be0e6544f3d7..baae91f61bc2e 100644
--- a/clang/test/Sema/types.c
+++ b/clang/test/Sema/types.c
@@ -39,6 +39,31 @@ typedef unsigned __int128 check_uint_128;
 typedef __uint128_t check_uint_128; // expected-note {{here}}
 typedef int check_uint_128; // expected-error {{different types ('int' vs '__uint128_t' (aka 'unsigned __int128'))}}
 
+#ifdef __SIZEOF_INT256__
+// __int256_t / __uint256_t are available (mirrors __int128_t tests above).
+void a256(void) {
+  __int256_t s;
+  __uint256_t t;
+}
+
+// __int256 is a keyword
+int c256(void) {
+  __int256 i;
+  unsigned __int256 j;
+  long unsigned __int256 k; // expected-error {{'long __int256' is invalid}}
+  int __int256; // expected-error {{cannot combine with previous}} expected-warning {{does not declare anything}}
+}
+
+// __int256_t is __int256; __uint256_t is unsigned __int256.
+typedef __int256 check_int_256;
+typedef __int256_t check_int_256; // expected-note {{here}}
+typedef int check_int_256; // expected-error {{different types ('int' vs '__int256_t' (aka '__int256'))}}
+
+typedef unsigned __int256 check_uint_256;
+typedef __uint256_t check_uint_256; // expected-note {{here}}
+typedef int check_uint_256; // expected-error {{different types ('int' vs '__uint256_t' (aka 'unsigned __int256'))}}
+#endif
+
 // Array type merging should convert array size to whatever matches the target
 // pointer size.
 extern int i[1LL];
diff --git a/clang/test/SemaCUDA/int256.cu b/clang/test/SemaCUDA/int256.cu
new file mode 100644
index 0000000000000..ece1e099e0d5c
--- /dev/null
+++ b/clang/test/SemaCUDA/int256.cu
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   -fcuda-is-device -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   -fcuda-is-device -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   -fcuda-is-device -verify -fsyntax-only %s
+
+// Verify that __int256 is allowed in CUDA device code when the host target
+// supports it, matching the __int128 behavior (see allow-int128.cu).
+// In CUDA mode, the host type system is shared with the device — type support
+// diagnostics are deferred and not emitted for CUDA device compilations.
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__int256 h_glb;
+__device__ __int256 d_glb;
+
+__device__ __int256 bar() {
+  return d_glb;
+}
+
+__global__ void kernel() {
+  bar();
+}
diff --git a/clang/test/SemaCXX/deleted-operator.cpp b/clang/test/SemaCXX/deleted-operator.cpp
index 64b2b22e5661c..2f8c882db51b9 100644
--- a/clang/test/SemaCXX/deleted-operator.cpp
+++ b/clang/test/SemaCXX/deleted-operator.cpp
@@ -8,8 +8,8 @@ struct PR10757 {
 int PR10757f() {
   PR10757 a1;
   // FIXME: We get a ridiculous number of "built-in candidate" notes here...
-  if(~a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 6-8 {{built-in candidate}}
-  if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 1-144 {{built-in candidate}}
+  if(~a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 6-10 {{built-in candidate}}
+  if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 1-196 {{built-in candidate}}
 }
 
 struct DelOpDel {
diff --git a/clang/test/SemaCXX/int256-templates.cpp b/clang/test/SemaCXX/int256-templates.cpp
new file mode 100644
index 0000000000000..857cd13db2230
--- /dev/null
+++ b/clang/test/SemaCXX/int256-templates.cpp
@@ -0,0 +1,219 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fsyntax-only -verify %s
+//
+// Test __int256 behavior with C++ templates, SFINAE, concepts, and conversions.
+//
+// This exercises advanced C++ interactions that upstream reviewers are likely
+// to probe: NTTP (non-type template parameters), SFINAE, implicit/explicit
+// conversions, constexpr template metaprogramming, and aggregate initialization.
+//
+// Uses Clang builtin type traits (__is_integral, etc.) to avoid depending on
+// standard library headers, which are not available in %clang_cc1 tests.
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -std=c++20 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Minimal enable_if for SFINAE testing without <type_traits>
+template <bool B, typename T = void> struct enable_if {};
+template <typename T> struct enable_if<true, T> { using type = T; };
+template <bool B, typename T = void> using enable_if_t = typename enable_if<B, T>::type;
+
+// Minimal is_same
+template <typename T, typename U> struct is_same { static constexpr bool value = false; };
+template <typename T> struct is_same<T, T> { static constexpr bool value = true; };
+
+// Minimal conditional
+template <bool B, typename T, typename F> struct conditional { using type = F; };
+template <typename T, typename F> struct conditional<true, T, F> { using type = T; };
+template <bool B, typename T, typename F> using conditional_t = typename conditional<B, T, F>::type;
+
+// ========================================================================
+// 1. Non-type template parameter (NTTP)
+// ========================================================================
+
+// __int256 can be used as a non-type template parameter in C++20.
+template <__int256_t V>
+struct IntConstant {
+    static constexpr __int256_t value = V;
+};
+
+static_assert(IntConstant<0>::value == 0);
+static_assert(IntConstant<42>::value == 42);
+static_assert(IntConstant<-1>::value == -1);
+
+// Large NTTP value
+static_assert(IntConstant<((__int256_t)1 << 200)>::value == ((__int256_t)1 << 200));
+
+// Unsigned NTTP
+template <__uint256_t V>
+struct UIntConstant {
+    static constexpr __uint256_t value = V;
+};
+
+static_assert(UIntConstant<0>::value == 0);
+static_assert(UIntConstant<~(__uint256_t)0>::value == ~(__uint256_t)0);
+
+// ========================================================================
+// 2. SFINAE on __is_integral
+// ========================================================================
+
+// Clang builtin __is_integral works for __int256 types.
+static_assert(__is_integral(__int256_t));
+static_assert(__is_integral(__uint256_t));
+static_assert(__is_integral(const __int256_t));
+static_assert(__is_integral(volatile __uint256_t));
+
+// SFINAE: enable_if selects the correct overload.
+template <typename T, enable_if_t<__is_integral(T)>* = nullptr>
+constexpr int classify(T) { return 1; }  // integral
+
+template <typename T, enable_if_t<__is_floating_point(T)>* = nullptr>
+constexpr int classify(T) { return 2; }  // floating
+
+static_assert(classify((__int256_t)42) == 1);
+static_assert(classify((__uint256_t)42) == 1);
+static_assert(classify(3.14) == 2);
+
+// ========================================================================
+// 3. Builtin type traits for __int256
+// ========================================================================
+
+// __is_signed / __is_unsigned
+static_assert(__is_signed(__int256_t));
+static_assert(!__is_unsigned(__int256_t));
+static_assert(__is_unsigned(__uint256_t));
+static_assert(!__is_signed(__uint256_t));
+
+// __is_arithmetic
+static_assert(__is_arithmetic(__int256_t));
+static_assert(__is_arithmetic(__uint256_t));
+
+// __is_fundamental
+static_assert(__is_fundamental(__int256_t));
+static_assert(__is_fundamental(__uint256_t));
+
+// __is_scalar
+static_assert(__is_scalar(__int256_t));
+static_assert(__is_scalar(__uint256_t));
+
+// __is_trivially_copyable
+static_assert(__is_trivially_copyable(__int256_t));
+static_assert(__is_trivially_copyable(__uint256_t));
+
+// __is_standard_layout
+static_assert(__is_standard_layout(__int256_t));
+static_assert(__is_standard_layout(__uint256_t));
+
+// __is_trivially_constructible
+static_assert(__is_trivially_constructible(__int256_t));
+static_assert(__is_trivially_destructible(__int256_t));
+
+// __is_constructible from various integer types
+static_assert(__is_constructible(__int256_t, int));
+static_assert(__is_constructible(__int256_t, long long));
+static_assert(__is_constructible(__int256_t, __int128_t));
+static_assert(__is_constructible(__uint256_t, unsigned));
+static_assert(__is_constructible(__uint256_t, __uint128_t));
+
+// __is_convertible (implicit conversions)
+static_assert(__is_convertible_to(int, __int256_t));
+static_assert(__is_convertible_to(__int128_t, __int256_t));
+static_assert(__is_convertible_to(__int256_t, __int128_t));
+
+// ========================================================================
+// 4. Implicit conversions: __int128 <-> __int256
+// ========================================================================
+
+// __int128 -> __int256: implicit widening (no data loss)
+constexpr __int256_t widen_s(__int128_t x) { return x; }
+constexpr __uint256_t widen_u(__uint128_t x) { return x; }
+
+static_assert(widen_s(42) == 42);
+static_assert(widen_s(-1) == -1);
+static_assert(widen_u(42) == 42);
+
+// __int256 -> __int128: implicit narrowing (may lose data)
+constexpr __int128_t narrow_s(__int256_t x) { return x; }
+constexpr __uint128_t narrow_u(__uint256_t x) { return x; }
+
+static_assert(narrow_s(42) == 42);
+static_assert(narrow_u(42) == 42);
+
+// int -> __int256: implicit widening
+constexpr __int256_t from_int(int x) { return x; }
+static_assert(from_int(42) == 42);
+static_assert(from_int(-1) == -1);
+
+// ========================================================================
+// 5. Template argument deduction
+// ========================================================================
+
+template <typename T>
+constexpr T identity(T x) { return x; }
+
+static_assert(identity((__int256_t)42) == 42);
+static_assert(identity((__uint256_t)42) == 42);
+
+// Deduction with auto
+constexpr auto auto_val = (__int256_t)100;
+static_assert(is_same<decltype(auto_val), const __int256_t>::value);
+
+// ========================================================================
+// 6. constexpr template metaprogramming
+// ========================================================================
+
+// Recursive constexpr factorial
+template <typename T>
+constexpr T factorial(T n) {
+    return n <= 1 ? T(1) : n * factorial(n - 1);
+}
+
+// 20! = 2432902008176640000 (fits in 64-bit)
+static_assert(factorial((__int256_t)20) == 2432902008176640000LL);
+
+// 34! = 295232799039604140847618609643520000000 (doesn't fit in 128-bit)
+constexpr __int256_t fact34 = factorial((__int256_t)34);
+// Verify lower 64 bits (computed from 34! mod 2^64)
+static_assert((unsigned long long)fact34 == 0x445DA75B00000000ULL);
+
+// ========================================================================
+// 7. Variadic templates
+// ========================================================================
+
+template <typename... Ts>
+constexpr auto sum(Ts... args) {
+    return (args + ...);
+}
+
+static_assert(sum((__int256_t)1, (__int256_t)2, (__int256_t)3) == 6);
+
+// ========================================================================
+// 8. Conditional type selection
+// ========================================================================
+
+static_assert(sizeof(conditional_t<true, __int256_t, __int128_t>) == 32);
+static_assert(sizeof(conditional_t<false, __int256_t, __int128_t>) == 16);
+
+// ========================================================================
+// 9. Array and aggregate initialization
+// ========================================================================
+
+struct Pair256 {
+    __int256_t first;
+    __uint256_t second;
+};
+
+constexpr Pair256 p = {42, 100};
+static_assert(p.first == 42);
+static_assert(p.second == 100);
+
+constexpr __int256_t arr[] = {1, 2, 3, 4, 5};
+static_assert(arr[0] + arr[4] == 6);
+
+// ========================================================================
+// 10. sizeof / alignof
+// ========================================================================
+
+static_assert(sizeof(__int256_t) == 32);
+static_assert(sizeof(__uint256_t) == 32);
+static_assert(alignof(__int256_t) == 16);
+static_assert(alignof(__uint256_t) == 16);
+static_assert(sizeof(__int256_t) == 2 * sizeof(__int128_t));
diff --git a/clang/test/SemaCXX/int256-type-traits.cpp b/clang/test/SemaCXX/int256-type-traits.cpp
new file mode 100644
index 0000000000000..c756532306823
--- /dev/null
+++ b/clang/test/SemaCXX/int256-type-traits.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-linux-gnu -std=c++20 %s
+// expected-no-diagnostics
+
+// --- Type classification traits ---
+
+static_assert(__is_integral(__int256));
+static_assert(__is_integral(unsigned __int256));
+static_assert(__is_integral(__int256_t));
+static_assert(__is_integral(__uint256_t));
+
+static_assert(__is_arithmetic(__int256));
+static_assert(__is_arithmetic(unsigned __int256));
+
+static_assert(__is_scalar(__int256));
+static_assert(__is_scalar(unsigned __int256));
+
+static_assert(__is_fundamental(__int256));
+static_assert(__is_fundamental(unsigned __int256));
+
+// --- Signedness traits ---
+
+static_assert(__is_signed(__int256));
+static_assert(!__is_unsigned(__int256));
+static_assert(!__is_signed(unsigned __int256));
+static_assert(__is_unsigned(unsigned __int256));
+
+static_assert(__is_signed(__int256_t));
+static_assert(__is_unsigned(__uint256_t));
+
+// --- __builtin_is_implicit_lifetime ---
+
+static_assert(__builtin_is_implicit_lifetime(__int256));
+static_assert(__builtin_is_implicit_lifetime(unsigned __int256));
+
+// --- __make_signed / __make_unsigned ---
+
+static_assert(__is_same(__make_signed(__int256), __int256));
+static_assert(__is_same(__make_signed(unsigned __int256), __int256));
+static_assert(__is_same(__make_unsigned(__int256), unsigned __int256));
+static_assert(__is_same(__make_unsigned(unsigned __int256), unsigned __int256));
+
+// With cv-qualifiers
+static_assert(__is_same(__make_signed(const __int256), const __int256));
+static_assert(__is_same(__make_signed(volatile unsigned __int256), volatile __int256));
+static_assert(__is_same(__make_signed(const volatile unsigned __int256), const volatile __int256));
+static_assert(__is_same(__make_unsigned(const __int256), const unsigned __int256));
+static_assert(__is_same(__make_unsigned(volatile __int256), volatile unsigned __int256));
+
+// --- Enum with __int256 underlying type ---
+
+enum E256 : __int256_t { E256_Zero = 0, E256_One = 1 };
+enum U256 : __uint256_t { U256_Zero = 0, U256_One = 1 };
+
+static_assert(__is_same(__make_signed(E256), __int256_t));
+static_assert(__is_same(__make_unsigned(E256), __uint256_t));
+static_assert(__is_same(__make_signed(U256), __int256_t));
+static_assert(__is_same(__make_unsigned(U256), __uint256_t));
+
+// --- sizeof / alignof ---
+
+static_assert(sizeof(__int256) == 32);
+static_assert(alignof(__int256) == 16);
+static_assert(sizeof(unsigned __int256) == 32);
+static_assert(alignof(unsigned __int256) == 16);
+static_assert(sizeof(__int256_t) == 32);
+static_assert(sizeof(__uint256_t) == 32);
+
+// --- Overload resolution ---
+
+constexpr int select_overload(__int128) { return 128; }
+constexpr int select_overload(__int256_t) { return 256; }
+
+static_assert(select_overload((__int256_t)0) == 256);
+static_assert(select_overload((__int128)0) == 128);
diff --git a/clang/test/SemaCXX/overloaded-builtin-operators.cpp b/clang/test/SemaCXX/overloaded-builtin-operators.cpp
index 0c76df79e6e14..7243969896181 100644
--- a/clang/test/SemaCXX/overloaded-builtin-operators.cpp
+++ b/clang/test/SemaCXX/overloaded-builtin-operators.cpp
@@ -195,7 +195,7 @@ struct A {
 
 void test_dr425(A a) {
   (void)(1.0f * a); // expected-error{{ambiguous}} \
-                    // expected-note 12{{candidate}}
+                    // expected-note 14{{candidate}}
 }
 
 // pr5432
@@ -238,7 +238,7 @@ namespace PR8477 {
     // FIXME: It would be nice to report fewer candidates here.
     (void)(foo - foo); // expected-error{{use of overloaded operator '-' is ambiguous}} \
     // expected-note 4{{built-in candidate operator-}} \
-    // expected-note{{142 candidates omitted}}
+    // expected-note{{194 candidates omitted}}
     return foo[zero] == zero;
   }
 }
diff --git a/clang/test/SemaSYCL/int256.cpp b/clang/test/SemaSYCL/int256.cpp
new file mode 100644
index 0000000000000..a1516a0c6f05c
--- /dev/null
+++ b/clang/test/SemaSYCL/int256.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -triple spir64 -aux-triple x86_64-unknown-linux-gnu \
+// RUN:    -fsycl-is-device -verify -fsyntax-only %s
+
+// Verify that __int256 is rejected in SYCL device code on targets
+// that don't support it, mirroring the __int128 restriction test.
+
+typedef __uint256_t BIGTY;
+
+template <class T>
+class Z {
+public:
+  // expected-note at +1 {{'field' defined here}}
+  T field;
+  // expected-note at +1 2{{'field1' defined here}}
+  __int256 field1;
+};
+
+void host_ok(void) {
+  __int256 A;
+  int B = sizeof(__int256);
+  Z<__int256> C;
+  C.field1 = A;
+}
+
+void usage() {
+  // expected-note at +1 {{'A' defined here}}
+  __int256 A;
+  Z<__int256> C;
+  // expected-error at +3 2{{expression requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+  // expected-error at +2 {{'A' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+  // expected-error at +1 {{'field1' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+  C.field1 = A;
+}
+
+template <typename Name, typename Func>
+__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
+  // expected-note at +1 2{{called by 'kernel}}
+  kernelFunc();
+}
+
+int main() {
+  // expected-note at +1 {{'CapturedToDevice' defined here}}
+  __int256 CapturedToDevice = 1;
+  host_ok();
+  kernel<class variables>([=]() {
+    // expected-error at +1 {{'CapturedToDevice' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    auto C = CapturedToDevice;
+    Z<__int256> S;
+    // expected-error at +2 {{expression requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    // expected-error at +1 {{'field1' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    S.field1 += 1;
+    // expected-error at +2 {{expression requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    // expected-error at +1 {{'field' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    S.field = 1;
+  });
+
+  kernel<class functions>([=]() {
+    // expected-note at +1 {{called by 'operator()'}}
+    usage();
+  });
+
+  kernel<class ok>([=]() {
+    Z<__int256> S;
+    auto A = sizeof(CapturedToDevice);
+  });
+
+  return 0;
+}
+
+// no error expected for host-side functions
+BIGTY zoo(BIGTY h) {
+  h = 1;
+  return h;
+}

>From bdd7361adb4234bfb935704aaeb74641a01788a8 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:38:46 +0100
Subject: [PATCH 04/17] [clang][docs] Add __int256 documentation and release
 notes

Document the new __int256/__uint256 builtin type in LanguageExtensions.rst
with usage examples, target availability, and comparison to _BitInt(256).
Add release note entry for Clang 22.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 clang/docs/LanguageExtensions.rst | 101 +++++++++++++++++++++++++++++-
 clang/docs/ReleaseNotes.rst       |   5 ++
 2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a3e487f910725..a1e058d959148 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -451,6 +451,101 @@ favor of the standard type.
 Note: the ABI for ``_BitInt(N)`` is still in the process of being stabilized,
 so this type should not yet be used in interfaces that require ABI stability.
 
+``__int256``
+------------
+
+Clang supports ``__int256`` as a builtin 256-bit integer type on targets that
+opt in (currently x86-64 and AArch64). It is the 256-bit analogue of
+``__int128`` — a first-class builtin type with full type trait integration,
+proper alignment, and register-based calling conventions.
+
+**Type spellings:**
+
+- ``__int256``, ``signed __int256``, ``__int256_t`` — signed 256-bit integer
+- ``unsigned __int256``, ``__uint256_t`` — unsigned 256-bit integer
+
+**Feature detection:**
+
+Use ``__SIZEOF_INT256__`` (defined as ``32`` when available) or
+``__is_target_feature("int256")`` for preprocessor-level detection:
+
+.. code-block:: c
+
+  #ifdef __SIZEOF_INT256__
+  // __int256 is available
+  #endif
+
+**Properties:**
+
+- Size: 32 bytes (256 bits)
+- Alignment: 16 bytes (matching the ``__int128`` ABI alignment on x86-64 and AArch64)
+- ABI: register-based on x86-64 (arguments in GPRs, return via sret)
+- Integer rank: above ``__int128`` (correct implicit conversion rules)
+
+**Supported operations:**
+
+All standard integer operations work: arithmetic (``+``, ``-``, ``*``, ``/``,
+``%``), bitwise (``&``, ``|``, ``^``, ``~``, ``<<``, ``>>``), comparisons
+(``==``, ``!=``, ``<``, ``>``, ``<=``, ``>=``), and conversions to/from other
+integer and floating-point types.
+
+**Supported builtins:**
+
+- ``__builtin_popcountg``, ``__builtin_clzg``, ``__builtin_ctzg``
+- ``__builtin_add_overflow``, ``__builtin_sub_overflow``, ``__builtin_mul_overflow``
+- ``__builtin_ffs`` (expanded inline via ``cttz``)
+
+**C++ type traits:**
+
+In C++ mode, ``__int256`` is a full integral type:
+
+- ``__is_integral(__int256)`` is ``true``
+- ``__is_arithmetic``, ``__is_scalar``, ``__is_fundamental`` are ``true``
+- ``__is_signed(__int256)`` is ``true``; ``__is_unsigned(unsigned __int256)`` is ``true``
+- ``__make_signed(unsigned __int256)`` yields ``__int256``
+- ``__make_unsigned(__int256)`` yields ``unsigned __int256``
+- Enums with ``__int256_t`` or ``__uint256_t`` as underlying type are supported
+
+**Relationship to** ``_BitInt(256)``:
+
+Both ``__int256`` and ``_BitInt(256)`` produce identical ``i256`` IR operations,
+but they differ in ABI, alignment, and type trait behavior:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Property
+     - ``__int256``
+     - ``_BitInt(256)``
+   * - Alignment
+     - 16 bytes
+     - 8 bytes
+   * - x86-64 SysV args
+     - Direct (4 GPRs)
+     - Indirect (byval)
+   * - x86-64 SysV return
+     - Indirect (sret)
+     - Indirect (sret)
+   * - AArch64 args
+     - Direct (4 GPRs: x0-x3)
+     - Indirect (byval)
+   * - AArch64 return
+     - Direct (4 GPRs: x0-x3)
+     - Indirect (sret)
+   * - Win64 ABI
+     - Indirect
+     - Indirect
+   * - ``__is_integral``
+     - ``true``
+     - ``false``
+   * - ``std::numeric_limits``
+     - Fully specialized
+     - Not specialized
+
+The ABI difference has measurable performance impact: the register-based
+calling convention avoids memory round-trips for ``__int256`` function
+arguments on x86-64 and both arguments and return values on AArch64.
+
 C keywords supported in all language modes
 ------------------------------------------
 
@@ -4561,7 +4656,8 @@ argument can be of any unsigned integer type or fixed boolean vector.
 
 ``__builtin_popcountg`` is meant to be a type-generic alternative to the
 ``__builtin_popcount{,l,ll}`` builtins, with support for other integer types,
-such as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
+such as ``unsigned __int128``, ``unsigned __int256``, and C23
+``unsigned _BitInt(N)``.
 
 ``__builtin_clzg`` and ``__builtin_ctzg``
 -----------------------------------------
@@ -4608,7 +4704,8 @@ only one argument is provided, then the behavior is undefined.
 ``__builtin_clzg`` (respectively ``__builtin_ctzg``) is meant to be a
 type-generic alternative to the ``__builtin_clz{,l,ll}`` (respectively
 ``__builtin_ctz{,l,ll}``) builtins, with support for other integer types, such
-as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
+as ``unsigned __int128``, ``unsigned __int256``, and C23
+``unsigned _BitInt(N)``.
 
 ``__builtin_counted_by_ref``
 ----------------------------
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 668097236fe97..066de124e0451 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -145,6 +145,11 @@ C23 Feature Support
 Non-comprehensive list of changes in this release
 -------------------------------------------------
 
+- Added ``__int256`` and ``__uint256`` as builtin extended integer types on all
+  64-bit targets, analogous to ``__int128``/``__uint128``. These types provide
+  native 256-bit integer arithmetic with compiler-rt runtime support for
+  arithmetic, division, shifts, and float conversions.
+
 - Added ``__builtin_stdc_rotate_left`` and ``__builtin_stdc_rotate_right``
   for bit rotation of unsigned integers including ``_BitInt`` types. Rotation
   counts are normalized modulo the bit-width and support negative values.

>From 1c832b7711ed63bbfa6557628c76afc34cd0e32b Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:38:59 +0100
Subject: [PATCH 05/17] [llvm] Add i256 data layout, libcall routing, and
 codegen support

- Reflow 64-bit target data layout strings (X86-64, AArch64, RISC-V 64,
  PPC64, SystemZ, etc.) in preparation for i256 alignment entries
- Register i256 division/modulo runtime libcall names (__divoi3,
  __udivoi3, __udivmodoi4, etc.) in RuntimeLibcalls.td for X86-64 and
  AArch64
- Add i256 legalization support: AArch64 raises the supported div/rem
  and large-FP-convert bit widths to 256; X86-64 intentionally keeps
  IR-level expansion (see the ABI note in X86ISelLowering.cpp)
- Add LegalizeDAG support for 256-bit libcall expansion
- Add llvm-libgcc version script entries for new builtins

Shifts are NOT registered as libcalls (uses default ExpandThroughStack)
to avoid sanitizer link failures.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 llvm-libgcc/gcc_s.ver.in                      | 10 +++
 llvm/include/llvm/IR/RuntimeLibcalls.td       | 70 +++++++++++++++++--
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 69 +++++++++---------
 llvm/lib/CodeGen/TargetLoweringBase.cpp       | 54 ++++++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    |  4 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  7 ++
 llvm/lib/TargetParser/TargetDataLayout.cpp    | 22 +++---
 7 files changed, 185 insertions(+), 51 deletions(-)

diff --git a/llvm-libgcc/gcc_s.ver.in b/llvm-libgcc/gcc_s.ver.in
index e0bbf0e071553..22dc316fd0fb9 100644
--- a/llvm-libgcc/gcc_s.ver.in
+++ b/llvm-libgcc/gcc_s.ver.in
@@ -67,6 +67,16 @@ GCC_4.3.0 { __bswapdi2; __bswapsi2; __emutls_get_address;                 };
   GCC_3.4.4 { __absvti2;     __addvti3; __mulvti3;   __negvti2;    __subvti3; };
   GCC_4.2.0 { __floatuntidf; __floatuntisf;                                   };
   GCC_7.0.0 { __divmodti4;                                                    };
+
+  // 256-bit integer builtins (compiler-rt only, requires __int256 support)
+  COMPILER_RT_256 {
+    __ashloi3; __ashroi3; __lshroi3; __multi5; __divoi3; __udivoi3; __modoi3;
+    __umodoi3; __negoi2; __cmpoi2; __ucmpoi2; __udivmodoi4; __divmodoi4;
+    __clzoi2; __ctzoi2; __popcountoi2; __parityoi2; __ffsoi2;
+    __absvoi2; __addvoi3; __subvoi3; __mulvoi3; __negvoi2; __muloi5;
+    __fixsfoi; __fixdfoi; __fixunssfoi; __fixunsdfoi;
+    __floatoisf; __floatoidf; __floatunoisf; __floatunoidf;
+  };
 #endif
 
 #if defined(GLOBAL_X86)
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index e4a926d3cb1d3..e35fbbd41e54c 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -81,21 +81,21 @@ def ExceptionModelIsSjLj : RuntimeLibcallPredicate<
 //--------------------------------------------------------------------
 
 // Integer
-foreach IntTy = ["I16", "I32", "I64", "I128"] in {
+foreach IntTy = ["I16", "I32", "I64", "I128", "I256"] in {
   def SHL_#IntTy : RuntimeLibcall;
   def SRL_#IntTy : RuntimeLibcall;
   def SRA_#IntTy : RuntimeLibcall;
 }
 
-foreach IntTy = ["I8", "I16", "I32", "I64", "I128"] in {
+foreach IntTy = ["I8", "I16", "I32", "I64", "I128", "I256"] in {
   def MUL_#IntTy : RuntimeLibcall;
 }
 
-foreach IntTy = ["I32", "I64", "I128" ] in {
+foreach IntTy = ["I32", "I64", "I128", "I256"] in {
   def MULO_#IntTy : RuntimeLibcall;
 }
 
-foreach IntTy = ["I8", "I16", "I32", "I64", "I128"] in {
+foreach IntTy = ["I8", "I16", "I32", "I64", "I128", "I256"] in {
   def SDIV_#IntTy : RuntimeLibcall;
   def UDIV_#IntTy : RuntimeLibcall;
   def SREM_#IntTy : RuntimeLibcall;
@@ -108,7 +108,7 @@ foreach IntTy = ["I32", "I64" ] in {
   def NEG_#IntTy : RuntimeLibcall;
 }
 
-foreach IntTy = ["I32", "I64", "I128"] in {
+foreach IntTy = ["I32", "I64", "I128", "I256"] in {
   def CTLZ_#IntTy : RuntimeLibcall;
   def CTPOP_#IntTy : RuntimeLibcall;
 }
@@ -305,12 +305,16 @@ def FPTOSINT_F64_I128 : RuntimeLibcall;
 def FPTOSINT_F80_I32 : RuntimeLibcall;
 def FPTOSINT_F80_I64 : RuntimeLibcall;
 def FPTOSINT_F80_I128 : RuntimeLibcall;
+def FPTOSINT_F80_I256 : RuntimeLibcall;
 def FPTOSINT_F128_I32 : RuntimeLibcall;
 def FPTOSINT_F128_I64 : RuntimeLibcall;
 def FPTOSINT_F128_I128 : RuntimeLibcall;
 def FPTOSINT_PPCF128_I32 : RuntimeLibcall;
 def FPTOSINT_PPCF128_I64 : RuntimeLibcall;
 def FPTOSINT_PPCF128_I128 : RuntimeLibcall;
+def FPTOSINT_F32_I256 : RuntimeLibcall;
+def FPTOSINT_F64_I256 : RuntimeLibcall;
+def FPTOSINT_F128_I256 : RuntimeLibcall;
 def FPTOUINT_F16_I32 : RuntimeLibcall;
 def FPTOUINT_F16_I64 : RuntimeLibcall;
 def FPTOUINT_F16_I128 : RuntimeLibcall;
@@ -323,12 +327,16 @@ def FPTOUINT_F64_I128 : RuntimeLibcall;
 def FPTOUINT_F80_I32 : RuntimeLibcall;
 def FPTOUINT_F80_I64 : RuntimeLibcall;
 def FPTOUINT_F80_I128 : RuntimeLibcall;
+def FPTOUINT_F80_I256 : RuntimeLibcall;
 def FPTOUINT_F128_I32 : RuntimeLibcall;
 def FPTOUINT_F128_I64 : RuntimeLibcall;
 def FPTOUINT_F128_I128 : RuntimeLibcall;
 def FPTOUINT_PPCF128_I32 : RuntimeLibcall;
 def FPTOUINT_PPCF128_I64 : RuntimeLibcall;
 def FPTOUINT_PPCF128_I128 : RuntimeLibcall;
+def FPTOUINT_F32_I256 : RuntimeLibcall;
+def FPTOUINT_F64_I256 : RuntimeLibcall;
+def FPTOUINT_F128_I256 : RuntimeLibcall;
 def SINTTOFP_I32_F16 : RuntimeLibcall;
 def SINTTOFP_I32_F32 : RuntimeLibcall;
 def SINTTOFP_I32_F64 : RuntimeLibcall;
@@ -348,6 +356,10 @@ def SINTTOFP_I128_F64 : RuntimeLibcall;
 def SINTTOFP_I128_F80 : RuntimeLibcall;
 def SINTTOFP_I128_F128 : RuntimeLibcall;
 def SINTTOFP_I128_PPCF128 : RuntimeLibcall;
+def SINTTOFP_I256_F32 : RuntimeLibcall;
+def SINTTOFP_I256_F64 : RuntimeLibcall;
+def SINTTOFP_I256_F80 : RuntimeLibcall;
+def SINTTOFP_I256_F128 : RuntimeLibcall;
 def UINTTOFP_I32_F16 : RuntimeLibcall;
 def UINTTOFP_I32_F32 : RuntimeLibcall;
 def UINTTOFP_I32_F64 : RuntimeLibcall;
@@ -367,6 +379,10 @@ def UINTTOFP_I128_F64 : RuntimeLibcall;
 def UINTTOFP_I128_F80 : RuntimeLibcall;
 def UINTTOFP_I128_F128 : RuntimeLibcall;
 def UINTTOFP_I128_PPCF128 : RuntimeLibcall;
+def UINTTOFP_I256_F32 : RuntimeLibcall;
+def UINTTOFP_I256_F64 : RuntimeLibcall;
+def UINTTOFP_I256_F80 : RuntimeLibcall;
+def UINTTOFP_I256_F128 : RuntimeLibcall;
 def CONVERT_F128_PPCF128 : RuntimeLibcall;
 def CONVERT_PPCF128_F128 : RuntimeLibcall;
 
@@ -926,24 +942,28 @@ def __divhi3 : RuntimeLibcallImpl<SDIV_I16>;
 def __divsi3 : RuntimeLibcallImpl<SDIV_I32>;
 def __divdi3 : RuntimeLibcallImpl<SDIV_I64>;
 def __divti3 : RuntimeLibcallImpl<SDIV_I128>;
+def __divoi3 : RuntimeLibcallImpl<SDIV_I256>;
 
 def __udivqi3 : RuntimeLibcallImpl<UDIV_I8>;
 def __udivhi3 : RuntimeLibcallImpl<UDIV_I16>;
 def __udivsi3 : RuntimeLibcallImpl<UDIV_I32>;
 def __udivdi3 : RuntimeLibcallImpl<UDIV_I64>;
 def __udivti3 : RuntimeLibcallImpl<UDIV_I128>;
+def __udivoi3 : RuntimeLibcallImpl<UDIV_I256>;
 
 def __modqi3 : RuntimeLibcallImpl<SREM_I8>;
 def __modhi3 : RuntimeLibcallImpl<SREM_I16>;
 def __modsi3 : RuntimeLibcallImpl<SREM_I32>;
 def __moddi3 : RuntimeLibcallImpl<SREM_I64>;
 def __modti3 : RuntimeLibcallImpl<SREM_I128>;
+def __modoi3 : RuntimeLibcallImpl<SREM_I256>;
 
 def __umodqi3 : RuntimeLibcallImpl<UREM_I8>;
 def __umodhi3 : RuntimeLibcallImpl<UREM_I16>;
 def __umodsi3 : RuntimeLibcallImpl<UREM_I32>;
 def __umoddi3 : RuntimeLibcallImpl<UREM_I64>;
 def __umodti3 : RuntimeLibcallImpl<UREM_I128>;
+def __umodoi3 : RuntimeLibcallImpl<UREM_I256>;
 
 def __negsi2 : RuntimeLibcallImpl<NEG_I32>;
 def __negdi2 : RuntimeLibcallImpl<NEG_I64>;
@@ -951,10 +971,12 @@ def __negdi2 : RuntimeLibcallImpl<NEG_I64>;
 def __clzsi2 : RuntimeLibcallImpl<CTLZ_I32>;
 def __clzdi2 : RuntimeLibcallImpl<CTLZ_I64>;
 def __clzti2 : RuntimeLibcallImpl<CTLZ_I128>;
+def __clzoi2 : RuntimeLibcallImpl<CTLZ_I256>;
 
 def __popcountsi2 : RuntimeLibcallImpl<CTPOP_I32>;
 def __popcountdi2 : RuntimeLibcallImpl<CTPOP_I64>;
 def __popcountti2 : RuntimeLibcallImpl<CTPOP_I128>;
+def __popcountoi2 : RuntimeLibcallImpl<CTPOP_I256>;
 
 def __addsf3 : RuntimeLibcallImpl<ADD_F32>;
 def __adddf3 : RuntimeLibcallImpl<ADD_F64>;
@@ -1023,15 +1045,19 @@ def __fixhfti : RuntimeLibcallImpl<FPTOSINT_F16_I128>;
 def __fixsfsi : RuntimeLibcallImpl<FPTOSINT_F32_I32>;
 def __fixsfdi : RuntimeLibcallImpl<FPTOSINT_F32_I64>;
 def __fixsfti : RuntimeLibcallImpl<FPTOSINT_F32_I128>;
+def __fixsfoi : RuntimeLibcallImpl<FPTOSINT_F32_I256>;
 def __fixdfsi : RuntimeLibcallImpl<FPTOSINT_F64_I32>;
 def __fixdfdi : RuntimeLibcallImpl<FPTOSINT_F64_I64>;
 def __fixdfti : RuntimeLibcallImpl<FPTOSINT_F64_I128>;
+def __fixdfoi : RuntimeLibcallImpl<FPTOSINT_F64_I256>;
 def __fixxfsi : RuntimeLibcallImpl<FPTOSINT_F80_I32>;
 def __fixxfdi : RuntimeLibcallImpl<FPTOSINT_F80_I64>;
 def __fixxfti : RuntimeLibcallImpl<FPTOSINT_F80_I128>;
+def __fixxfoi : RuntimeLibcallImpl<FPTOSINT_F80_I256>;
 def __fixtfsi : RuntimeLibcallImpl<FPTOSINT_F128_I32>;
 def __fixtfdi_f128 : RuntimeLibcallImpl<FPTOSINT_F128_I64, "__fixtfdi">;
 def __fixtfti_f128 : RuntimeLibcallImpl<FPTOSINT_F128_I128, "__fixtfti">;
+def __fixtfoi_f128 : RuntimeLibcallImpl<FPTOSINT_F128_I256, "__fixtfoi">;
 def __gcc_qtou : RuntimeLibcallImpl<FPTOSINT_PPCF128_I32>;
 def __fixtfdi_ppcf128 : RuntimeLibcallImpl<FPTOSINT_PPCF128_I64, "__fixtfdi">;
 def __fixtfti_ppcf128 : RuntimeLibcallImpl<FPTOSINT_PPCF128_I128, "__fixtfti">;
@@ -1041,15 +1067,19 @@ def __fixunshfti : RuntimeLibcallImpl<FPTOUINT_F16_I128>;
 def __fixunssfsi : RuntimeLibcallImpl<FPTOUINT_F32_I32>;
 def __fixunssfdi : RuntimeLibcallImpl<FPTOUINT_F32_I64>;
 def __fixunssfti : RuntimeLibcallImpl<FPTOUINT_F32_I128>;
+def __fixunssfoi : RuntimeLibcallImpl<FPTOUINT_F32_I256>;
 def __fixunsdfsi : RuntimeLibcallImpl<FPTOUINT_F64_I32>;
 def __fixunsdfdi : RuntimeLibcallImpl<FPTOUINT_F64_I64>;
 def __fixunsdfti : RuntimeLibcallImpl<FPTOUINT_F64_I128>;
+def __fixunsdfoi : RuntimeLibcallImpl<FPTOUINT_F64_I256>;
 def __fixunsxfsi : RuntimeLibcallImpl<FPTOUINT_F80_I32>;
 def __fixunsxfdi : RuntimeLibcallImpl<FPTOUINT_F80_I64>;
 def __fixunsxfti : RuntimeLibcallImpl<FPTOUINT_F80_I128>;
+def __fixunsxfoi : RuntimeLibcallImpl<FPTOUINT_F80_I256>;
 def __fixunstfsi_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I32, "__fixunstfsi">;
 def __fixunstfdi_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I64, "__fixunstfdi">;
 def __fixunstfti_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I128, "__fixunstfti">;
+def __fixunstfoi_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I256, "__fixunstfoi">;
 def __fixunstfsi_ppcf128 : RuntimeLibcallImpl<FPTOUINT_PPCF128_I32, "__fixunstfsi">;
 def __fixunstfdi_ppcf128 : RuntimeLibcallImpl<FPTOUINT_PPCF128_I64, "__fixunstfdi">;
 def __fixunstfti_ppcf128 : RuntimeLibcallImpl<FPTOUINT_PPCF128_I128, "__fixunstfti">;
@@ -1072,6 +1102,10 @@ def __floattidf : RuntimeLibcallImpl<SINTTOFP_I128_F64>;
 def __floattixf : RuntimeLibcallImpl<SINTTOFP_I128_F80>;
 def __floattitf_f128 : RuntimeLibcallImpl<SINTTOFP_I128_F128, "__floattitf">;
 def __floattitf_ppcf128 : RuntimeLibcallImpl<SINTTOFP_I128_PPCF128, "__floattitf">;
+def __floatoisf : RuntimeLibcallImpl<SINTTOFP_I256_F32>;
+def __floatoidf : RuntimeLibcallImpl<SINTTOFP_I256_F64>;
+def __floatoixf : RuntimeLibcallImpl<SINTTOFP_I256_F80>;
+def __floatoitf_f128 : RuntimeLibcallImpl<SINTTOFP_I256_F128, "__floatoitf">;
 def __floatunsihf : RuntimeLibcallImpl<UINTTOFP_I32_F16>;
 def __floatunsisf : RuntimeLibcallImpl<UINTTOFP_I32_F32>;
 def __floatunsidf : RuntimeLibcallImpl<UINTTOFP_I32_F64>;
@@ -1091,6 +1125,10 @@ def __floatuntidf : RuntimeLibcallImpl<UINTTOFP_I128_F64>;
 def __floatuntixf : RuntimeLibcallImpl<UINTTOFP_I128_F80>;
 def __floatuntitf_f128 : RuntimeLibcallImpl<UINTTOFP_I128_F128, "__floatuntitf">;
 def __floatuntitf_ppcf128 : RuntimeLibcallImpl<UINTTOFP_I128_PPCF128, "__floatuntitf">;
+def __floatunoisf : RuntimeLibcallImpl<UINTTOFP_I256_F32>;
+def __floatunoidf : RuntimeLibcallImpl<UINTTOFP_I256_F64>;
+def __floatunoixf : RuntimeLibcallImpl<UINTTOFP_I256_F80>;
+def __floatunoitf_f128 : RuntimeLibcallImpl<UINTTOFP_I256_F128, "__floatunoitf">;
 def __extendkftf2 : RuntimeLibcallImpl<CONVERT_F128_PPCF128>;
 def __trunctfkf2 : RuntimeLibcallImpl<CONVERT_PPCF128_F128>;
 
@@ -1788,6 +1826,17 @@ defset list<RuntimeLibcallImpl> Int128RTLibcalls = {
   def __multi3 : RuntimeLibcallImpl<MUL_I128>;
 }
 
+defset list<RuntimeLibcallImpl> Int256RTLibcalls = {
+  // i256 libcalls are intentionally NOT registered.  The backend expands i256
+  // operations inline (shifts via ExpandToParts, multiplication via
+  // forceExpandMultiply).  Registering libcalls like __multi5 causes an ABI
+  // mismatch on targets where __int256 uses indirect passing (sret/byval):
+  // the backend generates calls with split-scalar convention (4 x i64 in regs)
+  // while the compiled builtins expect indirect pointers.  Additionally,
+  // registering libcalls causes link failures in sanitizer runtimes that embed
+  // UBSan but don't link against compiler-rt builtins.
+}
+
 //--------------------------------------------------------------------
 // compiler-rt only, not available by default
 //--------------------------------------------------------------------
@@ -1800,6 +1849,10 @@ defset list<RuntimeLibcallImpl> CompilerRTOnlyInt128Libcalls = {
   def __muloti4 : RuntimeLibcallImpl<MULO_I128>;
 }
 
+defset list<RuntimeLibcallImpl> CompilerRTOnlyInt256Libcalls = {
+  def __muloi5 : RuntimeLibcallImpl<MULO_I256>;
+}
+
 //--------------------------------------------------------------------
 // Define implementation other libcalls
 //--------------------------------------------------------------------
@@ -1976,7 +2029,9 @@ defvar DefaultRuntimeLibcallImpls_f128 =
 defvar DefaultRuntimeLibcallImplsBaseList =
 !listremove(
   !listremove(
-    !listremove(AllDefaultRuntimeLibcallImpls, Int128RTLibcalls),
+    !listremove(
+      !listremove(AllDefaultRuntimeLibcallImpls, Int128RTLibcalls),
+                  Int256RTLibcalls),
                 DefaultRuntimeLibcallImpls_f80),
                 DefaultRuntimeLibcallImpls_ppcf128);
 
@@ -2170,6 +2225,7 @@ def AArch64SystemLibrary : SystemRuntimeLibrary<
        LibmHasFrexpF128, LibmHasLdexpF128,
        AArch64LibcallImpls,
        LibcallImpls<(add Int128RTLibcalls), isAArch64_ILP64>,
+       Int256RTLibcalls,
        LibcallImpls<(add bzero), isOSDarwin>,
        DarwinExp10, DarwinSinCosStret, DarwinMemsetPattern,
        MacOSUnlockedIO,
@@ -3419,7 +3475,7 @@ def X86_32SystemLibrary
 
 def X86_64SystemLibrary
     : SystemRuntimeLibrary<isX86_64,
-      (add X86CommonLibcalls, Int128RTLibcalls)>;
+      (add X86CommonLibcalls, Int128RTLibcalls, Int256RTLibcalls)>;
 
 //===----------------------------------------------------------------------===//
 // XCore Runtime Libcalls
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index eb20e7982a102..817f98cec23b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -153,7 +153,8 @@ class SelectionDAGLegalize {
 
   SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8,
                            RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32,
-                           RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128);
+                           RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128,
+                           RTLIB::Libcall Call_I256);
   void ExpandArgFPLibCall(SDNode *Node,
                           RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64,
                           RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128,
@@ -161,7 +162,8 @@ class SelectionDAGLegalize {
                           SmallVectorImpl<SDValue> &Results);
   SDValue ExpandBitCountingLibCall(SDNode *Node, RTLIB::Libcall CallI32,
                                    RTLIB::Libcall CallI64,
-                                   RTLIB::Libcall CallI128);
+                                   RTLIB::Libcall CallI128,
+                                   RTLIB::Libcall CallI256);
   void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
 
   SDValue ExpandSincosStretLibCall(SDNode *Node) const;
@@ -2269,12 +2271,10 @@ void SelectionDAGLegalize::ExpandFastFPLibCall(
   ExpandFPLibCall(Node, LC, Results);
 }
 
-SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
-                                               RTLIB::Libcall Call_I8,
-                                               RTLIB::Libcall Call_I16,
-                                               RTLIB::Libcall Call_I32,
-                                               RTLIB::Libcall Call_I64,
-                                               RTLIB::Libcall Call_I128) {
+SDValue SelectionDAGLegalize::ExpandIntLibCall(
+    SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8,
+    RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32, RTLIB::Libcall Call_I64,
+    RTLIB::Libcall Call_I128, RTLIB::Libcall Call_I256) {
   RTLIB::Libcall LC;
   switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
@@ -2283,6 +2283,9 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
   case MVT::i32:  LC = Call_I32; break;
   case MVT::i64:  LC = Call_I64; break;
   case MVT::i128: LC = Call_I128; break;
+  case MVT::i256:
+    LC = Call_I256;
+    break;
   }
   return ExpandLibCall(LC, Node, isSigned).first;
 }
@@ -2305,7 +2308,7 @@ void SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node,
 
 SDValue SelectionDAGLegalize::ExpandBitCountingLibCall(
     SDNode *Node, RTLIB::Libcall CallI32, RTLIB::Libcall CallI64,
-    RTLIB::Libcall CallI128) {
+    RTLIB::Libcall CallI128, RTLIB::Libcall CallI256) {
   RTLIB::Libcall LC;
   switch (Node->getSimpleValueType(0).SimpleTy) {
   default:
@@ -2319,6 +2322,9 @@ SDValue SelectionDAGLegalize::ExpandBitCountingLibCall(
   case MVT::i128:
     LC = CallI128;
     break;
+  case MVT::i256:
+    LC = CallI256;
+    break;
   }
 
   // Bit-counting libcalls have one unsigned argument and return `int`.
@@ -5325,28 +5331,24 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     break;
   }
   case ISD::SREM:
-    Results.push_back(ExpandIntLibCall(Node, true,
-                                       RTLIB::SREM_I8,
-                                       RTLIB::SREM_I16, RTLIB::SREM_I32,
-                                       RTLIB::SREM_I64, RTLIB::SREM_I128));
+    Results.push_back(ExpandIntLibCall(
+        Node, true, RTLIB::SREM_I8, RTLIB::SREM_I16, RTLIB::SREM_I32,
+        RTLIB::SREM_I64, RTLIB::SREM_I128, RTLIB::SREM_I256));
     break;
   case ISD::UREM:
-    Results.push_back(ExpandIntLibCall(Node, false,
-                                       RTLIB::UREM_I8,
-                                       RTLIB::UREM_I16, RTLIB::UREM_I32,
-                                       RTLIB::UREM_I64, RTLIB::UREM_I128));
+    Results.push_back(ExpandIntLibCall(
+        Node, false, RTLIB::UREM_I8, RTLIB::UREM_I16, RTLIB::UREM_I32,
+        RTLIB::UREM_I64, RTLIB::UREM_I128, RTLIB::UREM_I256));
     break;
   case ISD::SDIV:
-    Results.push_back(ExpandIntLibCall(Node, true,
-                                       RTLIB::SDIV_I8,
-                                       RTLIB::SDIV_I16, RTLIB::SDIV_I32,
-                                       RTLIB::SDIV_I64, RTLIB::SDIV_I128));
+    Results.push_back(ExpandIntLibCall(
+        Node, true, RTLIB::SDIV_I8, RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+        RTLIB::SDIV_I64, RTLIB::SDIV_I128, RTLIB::SDIV_I256));
     break;
   case ISD::UDIV:
-    Results.push_back(ExpandIntLibCall(Node, false,
-                                       RTLIB::UDIV_I8,
-                                       RTLIB::UDIV_I16, RTLIB::UDIV_I32,
-                                       RTLIB::UDIV_I64, RTLIB::UDIV_I128));
+    Results.push_back(ExpandIntLibCall(
+        Node, false, RTLIB::UDIV_I8, RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+        RTLIB::UDIV_I64, RTLIB::UDIV_I128, RTLIB::UDIV_I256));
     break;
   case ISD::SDIVREM:
   case ISD::UDIVREM:
@@ -5354,18 +5356,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     ExpandDivRemLibCall(Node, Results);
     break;
   case ISD::MUL:
-    Results.push_back(ExpandIntLibCall(Node, false,
-                                       RTLIB::MUL_I8,
-                                       RTLIB::MUL_I16, RTLIB::MUL_I32,
-                                       RTLIB::MUL_I64, RTLIB::MUL_I128));
+    Results.push_back(ExpandIntLibCall(
+        Node, false, RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32,
+        RTLIB::MUL_I64, RTLIB::MUL_I128, RTLIB::MUL_I256));
     break;
   case ISD::CTLZ_ZERO_UNDEF:
-    Results.push_back(ExpandBitCountingLibCall(
-        Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64, RTLIB::CTLZ_I128));
+    Results.push_back(
+        ExpandBitCountingLibCall(Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64,
+                                 RTLIB::CTLZ_I128, RTLIB::CTLZ_I256));
     break;
   case ISD::CTPOP:
-    Results.push_back(ExpandBitCountingLibCall(
-        Node, RTLIB::CTPOP_I32, RTLIB::CTPOP_I64, RTLIB::CTPOP_I128));
+    Results.push_back(
+        ExpandBitCountingLibCall(Node, RTLIB::CTPOP_I32, RTLIB::CTPOP_I64,
+                                 RTLIB::CTPOP_I128, RTLIB::CTPOP_I256));
     break;
   case ISD::RESET_FPENV: {
     // It is legalized to call 'fesetenv(FE_DFL_ENV)'. On most targets
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index cc5a4219536ac..355063a91ec40 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -130,6 +130,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSHL(EVT VT) {
     return RTLIB::SHL_I64;
   if (VT == MVT::i128)
     return RTLIB::SHL_I128;
+  if (VT == MVT::i256)
+    return RTLIB::SHL_I256;
 
   return RTLIB::UNKNOWN_LIBCALL;
 }
@@ -143,6 +145,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSRL(EVT VT) {
     return RTLIB::SRL_I64;
   if (VT == MVT::i128)
     return RTLIB::SRL_I128;
+  if (VT == MVT::i256)
+    return RTLIB::SRL_I256;
 
   return RTLIB::UNKNOWN_LIBCALL;
 }
@@ -156,6 +160,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSRA(EVT VT) {
     return RTLIB::SRA_I64;
   if (VT == MVT::i128)
     return RTLIB::SRA_I128;
+  if (VT == MVT::i256)
+    return RTLIB::SRA_I256;
 
   return RTLIB::UNKNOWN_LIBCALL;
 }
@@ -169,6 +175,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getMUL(EVT VT) {
     return RTLIB::MUL_I64;
   if (VT == MVT::i128)
     return RTLIB::MUL_I128;
+  if (VT == MVT::i256)
+    return RTLIB::MUL_I256;
   return RTLIB::UNKNOWN_LIBCALL;
 }
 
@@ -179,6 +187,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getMULO(EVT VT) {
     return RTLIB::MULO_I64;
   if (VT == MVT::i128)
     return RTLIB::MULO_I128;
+  if (VT == MVT::i256)
+    return RTLIB::MULO_I256;
   return RTLIB::UNKNOWN_LIBCALL;
 }
 
@@ -191,6 +201,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSDIV(EVT VT) {
     return RTLIB::SDIV_I64;
   if (VT == MVT::i128)
     return RTLIB::SDIV_I128;
+  if (VT == MVT::i256)
+    return RTLIB::SDIV_I256;
   return RTLIB::UNKNOWN_LIBCALL;
 }
 
@@ -203,6 +215,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getUDIV(EVT VT) {
     return RTLIB::UDIV_I64;
   if (VT == MVT::i128)
     return RTLIB::UDIV_I128;
+  if (VT == MVT::i256)
+    return RTLIB::UDIV_I256;
   return RTLIB::UNKNOWN_LIBCALL;
 }
 
@@ -215,6 +229,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSREM(EVT VT) {
     return RTLIB::SREM_I64;
   if (VT == MVT::i128)
     return RTLIB::SREM_I128;
+  if (VT == MVT::i256)
+    return RTLIB::SREM_I256;
   return RTLIB::UNKNOWN_LIBCALL;
 }
 
@@ -227,6 +243,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getUREM(EVT VT) {
     return RTLIB::UREM_I64;
   if (VT == MVT::i128)
     return RTLIB::UREM_I128;
+  if (VT == MVT::i256)
+    return RTLIB::UREM_I256;
   return RTLIB::UNKNOWN_LIBCALL;
 }
 
@@ -237,6 +255,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getCTPOP(EVT VT) {
     return RTLIB::CTPOP_I64;
   if (VT == MVT::i128)
     return RTLIB::CTPOP_I128;
+  if (VT == MVT::i256)
+    return RTLIB::CTPOP_I256;
   return RTLIB::UNKNOWN_LIBCALL;
 }
 
@@ -356,6 +376,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
       return FPTOSINT_F32_I64;
     if (RetVT == MVT::i128)
       return FPTOSINT_F32_I128;
+    if (RetVT == MVT::i256)
+      return FPTOSINT_F32_I256;
   } else if (OpVT == MVT::f64) {
     if (RetVT == MVT::i32)
       return FPTOSINT_F64_I32;
@@ -363,6 +385,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
       return FPTOSINT_F64_I64;
     if (RetVT == MVT::i128)
       return FPTOSINT_F64_I128;
+    if (RetVT == MVT::i256)
+      return FPTOSINT_F64_I256;
   } else if (OpVT == MVT::f80) {
     if (RetVT == MVT::i32)
       return FPTOSINT_F80_I32;
@@ -370,6 +394,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
       return FPTOSINT_F80_I64;
     if (RetVT == MVT::i128)
       return FPTOSINT_F80_I128;
+    if (RetVT == MVT::i256)
+      return FPTOSINT_F80_I256;
   } else if (OpVT == MVT::f128) {
     if (RetVT == MVT::i32)
       return FPTOSINT_F128_I32;
@@ -377,6 +403,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
       return FPTOSINT_F128_I64;
     if (RetVT == MVT::i128)
       return FPTOSINT_F128_I128;
+    if (RetVT == MVT::i256)
+      return FPTOSINT_F128_I256;
   } else if (OpVT == MVT::ppcf128) {
     if (RetVT == MVT::i32)
       return FPTOSINT_PPCF128_I32;
@@ -405,6 +433,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
       return FPTOUINT_F32_I64;
     if (RetVT == MVT::i128)
       return FPTOUINT_F32_I128;
+    if (RetVT == MVT::i256)
+      return FPTOUINT_F32_I256;
   } else if (OpVT == MVT::f64) {
     if (RetVT == MVT::i32)
       return FPTOUINT_F64_I32;
@@ -412,6 +442,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
       return FPTOUINT_F64_I64;
     if (RetVT == MVT::i128)
       return FPTOUINT_F64_I128;
+    if (RetVT == MVT::i256)
+      return FPTOUINT_F64_I256;
   } else if (OpVT == MVT::f80) {
     if (RetVT == MVT::i32)
       return FPTOUINT_F80_I32;
@@ -419,6 +451,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
       return FPTOUINT_F80_I64;
     if (RetVT == MVT::i128)
       return FPTOUINT_F80_I128;
+    if (RetVT == MVT::i256)
+      return FPTOUINT_F80_I256;
   } else if (OpVT == MVT::f128) {
     if (RetVT == MVT::i32)
       return FPTOUINT_F128_I32;
@@ -426,6 +460,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
       return FPTOUINT_F128_I64;
     if (RetVT == MVT::i128)
       return FPTOUINT_F128_I128;
+    if (RetVT == MVT::i256)
+      return FPTOUINT_F128_I256;
   } else if (OpVT == MVT::ppcf128) {
     if (RetVT == MVT::i32)
       return FPTOUINT_PPCF128_I32;
@@ -481,6 +517,15 @@ RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
       return SINTTOFP_I128_F128;
     if (RetVT == MVT::ppcf128)
       return SINTTOFP_I128_PPCF128;
+  } else if (OpVT == MVT::i256) {
+    if (RetVT == MVT::f32)
+      return SINTTOFP_I256_F32;
+    if (RetVT == MVT::f64)
+      return SINTTOFP_I256_F64;
+    if (RetVT == MVT::f80)
+      return SINTTOFP_I256_F80;
+    if (RetVT == MVT::f128)
+      return SINTTOFP_I256_F128;
   }
   return UNKNOWN_LIBCALL;
 }
@@ -529,6 +574,15 @@ RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
       return UINTTOFP_I128_F128;
     if (RetVT == MVT::ppcf128)
       return UINTTOFP_I128_PPCF128;
+  } else if (OpVT == MVT::i256) {
+    if (RetVT == MVT::f32)
+      return UINTTOFP_I256_F32;
+    if (RetVT == MVT::f64)
+      return UINTTOFP_I256_F64;
+    if (RetVT == MVT::f80)
+      return UINTTOFP_I256_F80;
+    if (RetVT == MVT::f128)
+      return UINTTOFP_I256_F128;
   }
   return UNKNOWN_LIBCALL;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 01e07a70aaaf4..9cdbc4417e636 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1241,7 +1241,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setHasExtractBitsInsn(true);
 
-  setMaxDivRemBitWidthSupported(128);
+  setMaxDivRemBitWidthSupported(256);
+
+  setMaxLargeFPConvertBitWidthSupported(256);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   if (Subtarget->hasSME())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2188f6466682b..7f98d9f2bb14e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -173,6 +173,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   else
     setMaxAtomicSizeInBitsSupported(32);
 
+  // Note: i256 div/rem and FP conversions are intentionally NOT routed to
+  // libcalls on x86-64.  The x86-64 SysV ABI classifies __int256 as MEMORY
+  // (> 2 eightbytes), so the frontend uses indirect passing (sret/byval).
+  // Backend-generated libcalls pass i256 as a split scalar (4 x i64 in
+  // registers), creating an ABI mismatch with the compiled builtins.
+  // Instead, ExpandLargeDivRem and ExpandLargeFPConvert expand these
+  // operations at the IR level.
   setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
 
   setMaxLargeFPConvertBitWidthSupported(128);
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index b985c1eec4244..a0e9d6e6aea75 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -85,8 +85,8 @@ static std::string computeAArch64DataLayout(const Triple &TT) {
     if (TT.getArch() == Triple::aarch64_32)
       return "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
              "n32:64-S128-Fn32";
-    return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-"
-           "Fn32";
+    return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
+           "n32:64-S128-Fn32";
   }
   if (TT.isOSBinFormatCOFF())
     return "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:"
@@ -200,9 +200,10 @@ static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) {
   // 32 bit registers are always available and the stack is at least 64 bit
   // aligned. On N64 64 bit registers are also available and the stack is
   // 128 bit aligned.
-  if (ABI == MipsABI::N64 || ABI == MipsABI::N32)
-    Ret += "-i128:128-n32:64-S128";
-  else
+  if (ABI == MipsABI::N64 || ABI == MipsABI::N32) {
+    Ret += "-i128:128";
+    Ret += "-n32:64-S128";
+  } else
     Ret += "-n32-S64";
 
   return Ret;
@@ -242,9 +243,10 @@ static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) {
   Ret += "-i64:64";
 
   // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
-  if (is64Bit)
-    Ret += "-i128:128-n32:64";
-  else
+  if (is64Bit) {
+    Ret += "-i128:128";
+    Ret += "-n32:64";
+  } else
     Ret += "-n32";
 
   // The ABI alignment for doubles on AIX is 4 bytes.
@@ -410,9 +412,9 @@ static std::string computeX86DataLayout(const Triple &TT) {
   // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
   // 128 bit integers are not specified in the 32-bit ABIs but are used
   // internally for lowering f128, so we match the alignment to that.
-  if (Is64Bit || TT.isOSWindows())
+  if (Is64Bit || TT.isOSWindows()) {
     Ret += "-i64:64-i128:128";
-  else if (TT.isOSIAMCU())
+  } else if (TT.isOSIAMCU())
     Ret += "-i64:32-f64:32";
   else
     Ret += "-i128:128-f64:32:64";

>From 7e1a66ca5b3845c81262a2a929b04004fa1b3e0f Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:39:26 +0100
Subject: [PATCH 06/17] [llvm][test] Add and update i256 codegen tests

Add new i256-specific tests and regenerate affected existing tests:
- AArch64: bitcount (NEON/SVE/CSSC), comparisons, division, multiply,
  shifts, wide-scalar shift legalization, GlobalISel multiway splits
- X86: comparisons, division, multiply, shifts (i128/i256/i512),
  div-rem recomposition, overflow multiply, APX i1024 multiply,
  expand-large-fp (fptosi/fptoui/sitofp/uitofp for i129+), various
  regressions (dagcombine-cse, scheduler-backtracking, pr38539)
- RISC-V: i256 arithmetic (add, sub, mul, div, shifts)

Existing tests regenerated with update_llc_test_checks.py to reflect
i256 data layout alignment changes.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 .../GlobalISel/split-wide-shifts-multiway.ll  |  210 +-
 llvm/test/CodeGen/AArch64/div-i256.ll         | 1091 +---
 llvm/test/CodeGen/AArch64/shift-i256.ll       |   29 +-
 ...lar-shift-by-byte-multiple-legalization.ll |  102 +-
 .../AArch64/wide-scalar-shift-legalization.ll |   98 +-
 llvm/test/CodeGen/RISCV/i256-arith.ll         | 1442 +++++
 llvm/test/CodeGen/X86/apx/mul-i1024.ll        | 3014 ++++-----
 llvm/test/CodeGen/X86/bittest-big-integer.ll  |    2 +-
 llvm/test/CodeGen/X86/cmp-i256.ll             |  450 ++
 llvm/test/CodeGen/X86/dagcombine-cse.ll       |   38 +-
 llvm/test/CodeGen/X86/div-i256.ll             | 5475 +++++++++++++++++
 .../CodeGen/X86/expand-large-fp-optnone.ll    |  240 +-
 llvm/test/CodeGen/X86/fp-i129.ll              |   96 +-
 llvm/test/CodeGen/X86/i128-sdiv.ll            |  327 +-
 llvm/test/CodeGen/X86/memfold-mov32r0.ll      |    4 +-
 llvm/test/CodeGen/X86/mul-i1024.ll            | 1686 ++---
 llvm/test/CodeGen/X86/mul-i512.ll             |  392 +-
 .../CodeGen/X86/scheduler-backtracking.ll     |  168 +-
 .../CodeGen/X86/shift-i256-narrow-amount.ll   |  382 ++
 llvm/test/CodeGen/X86/shift-i256.ll           |  122 +-
 llvm/test/CodeGen/X86/shift-i512.ll           | 2736 ++++----
 llvm/test/CodeGen/X86/smul-with-overflow.ll   |  218 +-
 .../X86/smulo-128-legalisation-lowering.ll    |  290 +-
 llvm/test/CodeGen/X86/udivmodei5.ll           | 4968 ++++++++++++++-
 llvm/test/CodeGen/X86/umul-with-overflow.ll   |  186 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 4510 ++++++++------
 .../X86/wide-scalar-shift-legalization.ll     | 1066 ++--
 ...ad-of-small-alloca-with-zero-upper-half.ll |  778 ++-
 .../CodeGen/X86/widen-load-of-small-alloca.ll |  222 +-
 .../X86/expand-large-fp-convert-fptosi129.ll  |  186 +-
 .../X86/expand-large-fp-convert-fptoui129.ll  |  162 +-
 .../X86/expand-large-fp-convert-si129tofp.ll  |  553 +-
 .../X86/expand-large-fp-convert-ui129tofp.ll  |  553 +-
 .../X86/expand-large-fp-optnone.ll            |   84 +-
 .../Transforms/ExpandIRInsts/X86/sdiv129.ll   |    4 +-
 .../Transforms/ExpandIRInsts/X86/srem129.ll   |    4 +-
 .../Transforms/ExpandIRInsts/X86/udiv129.ll   |    4 +-
 .../Transforms/ExpandIRInsts/X86/urem129.ll   |    4 +-
 38 files changed, 21229 insertions(+), 10667 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/i256-arith.ll
 create mode 100644 llvm/test/CodeGen/X86/cmp-i256.ll
 create mode 100644 llvm/test/CodeGen/X86/div-i256.ll
 create mode 100644 llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index d669c49cb019b..e477a78d546e9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -7,8 +7,13 @@ target triple = "arm64-apple-macosx14.0.0"
 define void @test_shl_i512(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-LABEL: test_shl_i512:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    sub sp, sp, #128
-; SDAG-NEXT:    .cfi_def_cfa_offset 128
+; SDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SDAG-NEXT:    sub x9, sp, #144
+; SDAG-NEXT:    mov x29, sp
+; SDAG-NEXT:    and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT:    .cfi_def_cfa w29, 16
+; SDAG-NEXT:    .cfi_offset w30, -8
+; SDAG-NEXT:    .cfi_offset w29, -16
 ; SDAG-NEXT:    ldp x9, x8, [x1, #48]
 ; SDAG-NEXT:    movi.2d v0, #0000000000000000
 ; SDAG-NEXT:    ldp q1, q2, [x1]
@@ -64,7 +69,8 @@ define void @test_shl_i512(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    orr x9, x11, x16
 ; SDAG-NEXT:    stp x13, x10, [x0, #16]
 ; SDAG-NEXT:    stp x8, x9, [x0]
-; SDAG-NEXT:    add sp, sp, #128
+; SDAG-NEXT:    mov sp, x29
+; SDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: test_shl_i512:
@@ -360,8 +366,13 @@ entry:
 define void @test_lshr_i512(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-LABEL: test_lshr_i512:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    sub sp, sp, #128
-; SDAG-NEXT:    .cfi_def_cfa_offset 128
+; SDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SDAG-NEXT:    sub x9, sp, #144
+; SDAG-NEXT:    mov x29, sp
+; SDAG-NEXT:    and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT:    .cfi_def_cfa w29, 16
+; SDAG-NEXT:    .cfi_offset w30, -8
+; SDAG-NEXT:    .cfi_offset w29, -16
 ; SDAG-NEXT:    ldp x9, x8, [x1, #48]
 ; SDAG-NEXT:    movi.2d v0, #0000000000000000
 ; SDAG-NEXT:    ldp q1, q2, [x1]
@@ -416,7 +427,8 @@ define void @test_lshr_i512(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    orr x9, x12, x10
 ; SDAG-NEXT:    orr x8, x8, x15
 ; SDAG-NEXT:    stp x9, x8, [x0]
-; SDAG-NEXT:    add sp, sp, #128
+; SDAG-NEXT:    mov sp, x29
+; SDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: test_lshr_i512:
@@ -706,8 +718,13 @@ entry:
 define void @test_ashr_i512(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-LABEL: test_ashr_i512:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    sub sp, sp, #128
-; SDAG-NEXT:    .cfi_def_cfa_offset 128
+; SDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SDAG-NEXT:    sub x9, sp, #144
+; SDAG-NEXT:    mov x29, sp
+; SDAG-NEXT:    and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT:    .cfi_def_cfa w29, 16
+; SDAG-NEXT:    .cfi_offset w30, -8
+; SDAG-NEXT:    .cfi_offset w29, -16
 ; SDAG-NEXT:    ldp x9, x8, [x1, #48]
 ; SDAG-NEXT:    mov x11, sp
 ; SDAG-NEXT:    ldp q0, q1, [x1]
@@ -764,7 +781,8 @@ define void @test_ashr_i512(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    orr x9, x12, x10
 ; SDAG-NEXT:    orr x8, x8, x15
 ; SDAG-NEXT:    stp x9, x8, [x0]
-; SDAG-NEXT:    add sp, sp, #128
+; SDAG-NEXT:    mov sp, x29
+; SDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: test_ashr_i512:
@@ -1086,14 +1104,16 @@ entry:
 define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-LABEL: test_shl_i1024:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    sub sp, sp, #352
-; SDAG-NEXT:    stp x28, x27, [sp, #256] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x26, x25, [sp, #272] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x24, x23, [sp, #288] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x22, x21, [sp, #304] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x20, x19, [sp, #320] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x29, x30, [sp, #336] ; 16-byte Folded Spill
-; SDAG-NEXT:    .cfi_def_cfa_offset 352
+; SDAG-NEXT:    stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
+; SDAG-NEXT:    sub x9, sp, #256
+; SDAG-NEXT:    stp x26, x25, [sp, #16] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x24, x23, [sp, #32] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x22, x21, [sp, #48] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x20, x19, [sp, #64] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; SDAG-NEXT:    add x29, sp, #80
+; SDAG-NEXT:    and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT:    .cfi_def_cfa w29, 16
 ; SDAG-NEXT:    .cfi_offset w30, -8
 ; SDAG-NEXT:    .cfi_offset w29, -16
 ; SDAG-NEXT:    .cfi_offset w19, -24
@@ -1120,7 +1140,6 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    lsr x9, x8, #3
 ; SDAG-NEXT:    stp q0, q0, [sp]
 ; SDAG-NEXT:    stp q0, q0, [sp, #32]
-; SDAG-NEXT:    ldp x29, x30, [sp, #336] ; 16-byte Folded Reload
 ; SDAG-NEXT:    and x9, x9, #0x78
 ; SDAG-NEXT:    stp q0, q0, [sp, #64]
 ; SDAG-NEXT:    stp q0, q0, [sp, #96]
@@ -1161,33 +1180,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    lsr x3, x3, x4
 ; SDAG-NEXT:    lsr x5, x5, x4
 ; SDAG-NEXT:    lsr x7, x7, x4
-; SDAG-NEXT:    lsr x22, x22, #1
 ; SDAG-NEXT:    lsr x25, x25, x4
 ; SDAG-NEXT:    lsr x4, x28, x4
 ; SDAG-NEXT:    orr x1, x1, x20
 ; SDAG-NEXT:    lsl x20, x23, x8
 ; SDAG-NEXT:    lsl x23, x24, x8
+; SDAG-NEXT:    lsr x22, x22, #1
 ; SDAG-NEXT:    lsr x28, x26, #1
-; SDAG-NEXT:    lsr x22, x22, x10
 ; SDAG-NEXT:    lsl x24, x27, x8
-; SDAG-NEXT:    orr x4, x23, x4
 ; SDAG-NEXT:    lsl x6, x6, x8
-; SDAG-NEXT:    lsl x2, x2, x8
+; SDAG-NEXT:    orr x4, x23, x4
+; SDAG-NEXT:    lsr x22, x22, x10
+; SDAG-NEXT:    lsl x13, x13, x8
 ; SDAG-NEXT:    lsr x27, x28, x10
 ; SDAG-NEXT:    stp x4, x1, [x0, #112]
 ; SDAG-NEXT:    lsl x1, x26, x8
 ; SDAG-NEXT:    orr x20, x20, x22
 ; SDAG-NEXT:    lsr x4, x9, #1
-; SDAG-NEXT:    lsl x13, x13, x8
+; SDAG-NEXT:    lsl x14, x14, x8
 ; SDAG-NEXT:    orr x22, x24, x27
 ; SDAG-NEXT:    orr x1, x1, x25
 ; SDAG-NEXT:    stp x21, x20, [x0, #80]
-; SDAG-NEXT:    lsr x20, x17, #1
 ; SDAG-NEXT:    stp x1, x22, [x0, #96]
 ; SDAG-NEXT:    lsr x1, x11, #1
+; SDAG-NEXT:    lsr x20, x17, #1
 ; SDAG-NEXT:    lsr x21, x12, #1
-; SDAG-NEXT:    lsl x14, x14, x8
 ; SDAG-NEXT:    lsl x15, x15, x8
+; SDAG-NEXT:    lsl x2, x2, x8
 ; SDAG-NEXT:    lsr x20, x20, x10
 ; SDAG-NEXT:    lsl x17, x17, x8
 ; SDAG-NEXT:    orr x6, x6, x7
@@ -1196,25 +1215,26 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    lsr x1, x1, x10
 ; SDAG-NEXT:    lsl x11, x11, x8
 ; SDAG-NEXT:    lsr x10, x4, x10
-; SDAG-NEXT:    stp x6, x19, [x0, #64]
-; SDAG-NEXT:    orr x2, x2, x20
 ; SDAG-NEXT:    lsl x8, x9, x8
+; SDAG-NEXT:    orr x2, x2, x20
 ; SDAG-NEXT:    orr x17, x17, x5
-; SDAG-NEXT:    ldp x20, x19, [sp, #320] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x15, x15, x7
-; SDAG-NEXT:    ldp x22, x21, [sp, #304] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x12, x12, x3
-; SDAG-NEXT:    ldp x24, x23, [sp, #288] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x14, x14, x1
-; SDAG-NEXT:    ldp x26, x25, [sp, #272] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x11, x11, x16
-; SDAG-NEXT:    ldp x28, x27, [sp, #256] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x9, x13, x10
+; SDAG-NEXT:    stp x6, x19, [x0, #64]
 ; SDAG-NEXT:    stp x17, x2, [x0, #48]
 ; SDAG-NEXT:    stp x12, x15, [x0, #32]
 ; SDAG-NEXT:    stp x11, x14, [x0, #16]
 ; SDAG-NEXT:    stp x8, x9, [x0]
-; SDAG-NEXT:    add sp, sp, #352
+; SDAG-NEXT:    sub sp, x29, #80
+; SDAG-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: test_shl_i1024:
@@ -2354,23 +2374,28 @@ entry:
 define void @test_lshr_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-LABEL: test_lshr_i1024:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    sub sp, sp, #336
-; SDAG-NEXT:    stp x28, x27, [sp, #256] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x26, x25, [sp, #272] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x24, x23, [sp, #288] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x22, x21, [sp, #304] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x20, x19, [sp, #320] ; 16-byte Folded Spill
-; SDAG-NEXT:    .cfi_def_cfa_offset 336
-; SDAG-NEXT:    .cfi_offset w19, -8
-; SDAG-NEXT:    .cfi_offset w20, -16
-; SDAG-NEXT:    .cfi_offset w21, -24
-; SDAG-NEXT:    .cfi_offset w22, -32
-; SDAG-NEXT:    .cfi_offset w23, -40
-; SDAG-NEXT:    .cfi_offset w24, -48
-; SDAG-NEXT:    .cfi_offset w25, -56
-; SDAG-NEXT:    .cfi_offset w26, -64
-; SDAG-NEXT:    .cfi_offset w27, -72
-; SDAG-NEXT:    .cfi_offset w28, -80
+; SDAG-NEXT:    stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
+; SDAG-NEXT:    sub x9, sp, #256
+; SDAG-NEXT:    stp x26, x25, [sp, #16] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x24, x23, [sp, #32] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x22, x21, [sp, #48] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x20, x19, [sp, #64] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; SDAG-NEXT:    add x29, sp, #80
+; SDAG-NEXT:    and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT:    .cfi_def_cfa w29, 16
+; SDAG-NEXT:    .cfi_offset w30, -8
+; SDAG-NEXT:    .cfi_offset w29, -16
+; SDAG-NEXT:    .cfi_offset w19, -24
+; SDAG-NEXT:    .cfi_offset w20, -32
+; SDAG-NEXT:    .cfi_offset w21, -40
+; SDAG-NEXT:    .cfi_offset w22, -48
+; SDAG-NEXT:    .cfi_offset w23, -56
+; SDAG-NEXT:    .cfi_offset w24, -64
+; SDAG-NEXT:    .cfi_offset w25, -72
+; SDAG-NEXT:    .cfi_offset w26, -80
+; SDAG-NEXT:    .cfi_offset w27, -88
+; SDAG-NEXT:    .cfi_offset w28, -96
 ; SDAG-NEXT:    ldp x8, x9, [x1, #112]
 ; SDAG-NEXT:    movi.2d v0, #0000000000000000
 ; SDAG-NEXT:    ldp q1, q2, [x1]
@@ -2435,7 +2460,6 @@ define void @test_lshr_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    lsl x1, x27, x1
 ; SDAG-NEXT:    lsl x23, x23, x15
 ; SDAG-NEXT:    orr x5, x22, x5
-; SDAG-NEXT:    ldp x28, x27, [sp, #256] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x19, x25, x19
 ; SDAG-NEXT:    lsl x25, x26, #1
 ; SDAG-NEXT:    orr x20, x23, x20
@@ -2452,32 +2476,34 @@ define void @test_lshr_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    lsl x1, x9, #1
 ; SDAG-NEXT:    stp x7, x26, [x0, #112]
 ; SDAG-NEXT:    lsl x7, x10, #1
+; SDAG-NEXT:    lsl x5, x12, #1
 ; SDAG-NEXT:    orr x3, x3, x21
 ; SDAG-NEXT:    orr x13, x19, x13
-; SDAG-NEXT:    lsl x5, x12, #1
 ; SDAG-NEXT:    lsr x9, x9, x8
 ; SDAG-NEXT:    stp x13, x3, [x0, #48]
 ; SDAG-NEXT:    lsl x13, x1, x15
 ; SDAG-NEXT:    lsr x23, x23, x8
+; SDAG-NEXT:    stp x4, x6, [x0, #64]
 ; SDAG-NEXT:    lsr x12, x12, x8
+; SDAG-NEXT:    lsl x4, x5, x15
 ; SDAG-NEXT:    lsr x8, x10, x8
 ; SDAG-NEXT:    lsl x10, x7, x15
-; SDAG-NEXT:    stp x4, x6, [x0, #64]
-; SDAG-NEXT:    lsl x4, x5, x15
 ; SDAG-NEXT:    orr x9, x9, x17
 ; SDAG-NEXT:    orr x11, x13, x11
-; SDAG-NEXT:    ldp x20, x19, [sp, #320] ; 16-byte Folded Reload
-; SDAG-NEXT:    stp x11, x9, [x0, #16]
-; SDAG-NEXT:    orr x9, x10, x23
 ; SDAG-NEXT:    orr x12, x12, x2
-; SDAG-NEXT:    ldp x22, x21, [sp, #304] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x16, x4, x16
-; SDAG-NEXT:    ldp x24, x23, [sp, #288] ; 16-byte Folded Reload
+; SDAG-NEXT:    stp x11, x9, [x0, #16]
 ; SDAG-NEXT:    orr x8, x8, x14
-; SDAG-NEXT:    ldp x26, x25, [sp, #272] ; 16-byte Folded Reload
+; SDAG-NEXT:    orr x9, x10, x23
 ; SDAG-NEXT:    stp x16, x12, [x0, #32]
 ; SDAG-NEXT:    stp x9, x8, [x0]
-; SDAG-NEXT:    add sp, sp, #336
+; SDAG-NEXT:    sub sp, x29, #80
+; SDAG-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: test_lshr_i1024:
@@ -3574,23 +3600,28 @@ entry:
 define void @test_ashr_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-LABEL: test_ashr_i1024:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    sub sp, sp, #336
-; SDAG-NEXT:    stp x28, x27, [sp, #256] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x26, x25, [sp, #272] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x24, x23, [sp, #288] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x22, x21, [sp, #304] ; 16-byte Folded Spill
-; SDAG-NEXT:    stp x20, x19, [sp, #320] ; 16-byte Folded Spill
-; SDAG-NEXT:    .cfi_def_cfa_offset 336
-; SDAG-NEXT:    .cfi_offset w19, -8
-; SDAG-NEXT:    .cfi_offset w20, -16
-; SDAG-NEXT:    .cfi_offset w21, -24
-; SDAG-NEXT:    .cfi_offset w22, -32
-; SDAG-NEXT:    .cfi_offset w23, -40
-; SDAG-NEXT:    .cfi_offset w24, -48
-; SDAG-NEXT:    .cfi_offset w25, -56
-; SDAG-NEXT:    .cfi_offset w26, -64
-; SDAG-NEXT:    .cfi_offset w27, -72
-; SDAG-NEXT:    .cfi_offset w28, -80
+; SDAG-NEXT:    stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
+; SDAG-NEXT:    sub x9, sp, #256
+; SDAG-NEXT:    stp x26, x25, [sp, #16] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x24, x23, [sp, #32] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x22, x21, [sp, #48] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x20, x19, [sp, #64] ; 16-byte Folded Spill
+; SDAG-NEXT:    stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; SDAG-NEXT:    add x29, sp, #80
+; SDAG-NEXT:    and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT:    .cfi_def_cfa w29, 16
+; SDAG-NEXT:    .cfi_offset w30, -8
+; SDAG-NEXT:    .cfi_offset w29, -16
+; SDAG-NEXT:    .cfi_offset w19, -24
+; SDAG-NEXT:    .cfi_offset w20, -32
+; SDAG-NEXT:    .cfi_offset w21, -40
+; SDAG-NEXT:    .cfi_offset w22, -48
+; SDAG-NEXT:    .cfi_offset w23, -56
+; SDAG-NEXT:    .cfi_offset w24, -64
+; SDAG-NEXT:    .cfi_offset w25, -72
+; SDAG-NEXT:    .cfi_offset w26, -80
+; SDAG-NEXT:    .cfi_offset w27, -88
+; SDAG-NEXT:    .cfi_offset w28, -96
 ; SDAG-NEXT:    ldp x8, x9, [x1, #112]
 ; SDAG-NEXT:    mov x11, sp
 ; SDAG-NEXT:    ldp q0, q1, [x1]
@@ -3659,7 +3690,6 @@ define void @test_ashr_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    lsl x1, x27, x1
 ; SDAG-NEXT:    lsl x23, x23, x15
 ; SDAG-NEXT:    orr x5, x22, x5
-; SDAG-NEXT:    ldp x28, x27, [sp, #256] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x19, x25, x19
 ; SDAG-NEXT:    lsl x25, x26, #1
 ; SDAG-NEXT:    orr x20, x23, x20
@@ -3676,32 +3706,34 @@ define void @test_ashr_i1024(ptr %result, ptr %input, i32 %shift) {
 ; SDAG-NEXT:    lsl x1, x9, #1
 ; SDAG-NEXT:    stp x7, x26, [x0, #112]
 ; SDAG-NEXT:    lsl x7, x10, #1
+; SDAG-NEXT:    lsl x5, x12, #1
 ; SDAG-NEXT:    orr x3, x3, x21
 ; SDAG-NEXT:    orr x13, x19, x13
-; SDAG-NEXT:    lsl x5, x12, #1
 ; SDAG-NEXT:    lsr x9, x9, x8
 ; SDAG-NEXT:    stp x13, x3, [x0, #48]
 ; SDAG-NEXT:    lsl x13, x1, x15
 ; SDAG-NEXT:    lsr x23, x23, x8
+; SDAG-NEXT:    stp x4, x6, [x0, #64]
 ; SDAG-NEXT:    lsr x12, x12, x8
+; SDAG-NEXT:    lsl x4, x5, x15
 ; SDAG-NEXT:    lsr x8, x10, x8
 ; SDAG-NEXT:    lsl x10, x7, x15
-; SDAG-NEXT:    stp x4, x6, [x0, #64]
-; SDAG-NEXT:    lsl x4, x5, x15
 ; SDAG-NEXT:    orr x9, x9, x17
 ; SDAG-NEXT:    orr x11, x13, x11
-; SDAG-NEXT:    ldp x20, x19, [sp, #320] ; 16-byte Folded Reload
-; SDAG-NEXT:    stp x11, x9, [x0, #16]
-; SDAG-NEXT:    orr x9, x10, x23
 ; SDAG-NEXT:    orr x12, x12, x2
-; SDAG-NEXT:    ldp x22, x21, [sp, #304] ; 16-byte Folded Reload
 ; SDAG-NEXT:    orr x16, x4, x16
-; SDAG-NEXT:    ldp x24, x23, [sp, #288] ; 16-byte Folded Reload
+; SDAG-NEXT:    stp x11, x9, [x0, #16]
 ; SDAG-NEXT:    orr x8, x8, x14
-; SDAG-NEXT:    ldp x26, x25, [sp, #272] ; 16-byte Folded Reload
+; SDAG-NEXT:    orr x9, x10, x23
 ; SDAG-NEXT:    stp x16, x12, [x0, #32]
 ; SDAG-NEXT:    stp x9, x8, [x0]
-; SDAG-NEXT:    add sp, sp, #336
+; SDAG-NEXT:    sub sp, x29, #80
+; SDAG-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
+; SDAG-NEXT:    ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: test_ashr_i1024:
diff --git a/llvm/test/CodeGen/AArch64/div-i256.ll b/llvm/test/CodeGen/AArch64/div-i256.ll
index 48ac1963f465a..c18346062edbf 100644
--- a/llvm/test/CodeGen/AArch64/div-i256.ll
+++ b/llvm/test/CodeGen/AArch64/div-i256.ll
@@ -3,198 +3,10 @@
 
 define i256 @udiv256(i256 %a, i256 %b) nounwind {
 ; CHECK-LABEL: udiv256:
-; CHECK:       // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    orr x8, x5, x7
-; CHECK-NEXT:    orr x9, x4, x6
-; CHECK-NEXT:    orr x10, x0, x2
-; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    orr x9, x1, x3
-; CHECK-NEXT:    clz x11, x5
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    orr x8, x10, x9
-; CHECK-NEXT:    clz x10, x7
-; CHECK-NEXT:    ccmp x8, #0, #4, ne
-; CHECK-NEXT:    clz x8, x6
-; CHECK-NEXT:    clz x12, x1
-; CHECK-NEXT:    cset w9, eq
-; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    cmp x7, #0
-; CHECK-NEXT:    csel x8, x10, x8, ne
-; CHECK-NEXT:    clz x10, x4
-; CHECK-NEXT:    cmp x5, #0
-; CHECK-NEXT:    add x10, x10, #64
-; CHECK-NEXT:    csel x10, x11, x10, ne
-; CHECK-NEXT:    orr x11, x6, x7
-; CHECK-NEXT:    add x10, x10, #128
-; CHECK-NEXT:    cmp x11, #0
-; CHECK-NEXT:    clz x11, x3
-; CHECK-NEXT:    csel x8, x8, x10, ne
-; CHECK-NEXT:    clz x10, x2
-; CHECK-NEXT:    cmp x3, #0
-; CHECK-NEXT:    add x10, x10, #64
-; CHECK-NEXT:    csel x10, x11, x10, ne
-; CHECK-NEXT:    clz x11, x0
-; CHECK-NEXT:    cmp x1, #0
-; CHECK-NEXT:    add x11, x11, #64
-; CHECK-NEXT:    csel x11, x12, x11, ne
-; CHECK-NEXT:    orr x12, x2, x3
-; CHECK-NEXT:    add x11, x11, #128
-; CHECK-NEXT:    cmp x12, #0
-; CHECK-NEXT:    csel x10, x10, x11, ne
-; CHECK-NEXT:    subs x15, x8, x10
-; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    ngcs x13, xzr
-; CHECK-NEXT:    ngcs x14, xzr
-; CHECK-NEXT:    ngc x12, xzr
-; CHECK-NEXT:    cmp x8, x15
-; CHECK-NEXT:    ngcs xzr, x13
-; CHECK-NEXT:    ngcs xzr, x14
-; CHECK-NEXT:    ngcs xzr, x12
-; CHECK-NEXT:    csinc w16, w9, wzr, hs
-; CHECK-NEXT:    cmp w16, #0
-; CHECK-NEXT:    csel x8, xzr, x3, ne
-; CHECK-NEXT:    csel x9, xzr, x2, ne
-; CHECK-NEXT:    csel x10, xzr, x1, ne
-; CHECK-NEXT:    csel x11, xzr, x0, ne
-; CHECK-NEXT:    tbnz w16, #0, .LBB0_6
-; CHECK-NEXT:  // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT:    eor x16, x15, #0xff
-; CHECK-NEXT:    orr x17, x13, x12
-; CHECK-NEXT:    orr x16, x16, x14
-; CHECK-NEXT:    orr x16, x16, x17
-; CHECK-NEXT:    cbz x16, .LBB0_6
-; CHECK-NEXT:  // %bb.2: // %udiv-bb1
-; CHECK-NEXT:    sub sp, sp, #208
-; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    add x9, sp, #64
-; CHECK-NEXT:    sub x11, x8, x15
-; CHECK-NEXT:    add x9, x9, #32
-; CHECK-NEXT:    stp x28, x27, [sp, #128] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x8, x11, #3
-; CHECK-NEXT:    stp x26, x25, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT:    and x18, x11, #0x3f
-; CHECK-NEXT:    stp x24, x23, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT:    eor x18, x18, #0x3f
-; CHECK-NEXT:    and x8, x8, #0x18
-; CHECK-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT:    sub x16, x9, x8
-; CHECK-NEXT:    adds x8, x15, #1
-; CHECK-NEXT:    stp x0, x1, [sp, #96]
-; CHECK-NEXT:    adcs x9, x13, xzr
-; CHECK-NEXT:    mvn w19, w11
-; CHECK-NEXT:    stp x2, x3, [sp, #112]
-; CHECK-NEXT:    adcs x10, x14, xzr
-; CHECK-NEXT:    stp q0, q0, [sp, #64]
-; CHECK-NEXT:    ldp x15, x17, [x16, #8]
-; CHECK-NEXT:    ldr x13, [x16, #24]
-; CHECK-NEXT:    ldr x16, [x16]
-; CHECK-NEXT:    lsr x14, x15, #1
-; CHECK-NEXT:    lsr x20, x17, #1
-; CHECK-NEXT:    lsr x21, x16, #1
-; CHECK-NEXT:    lsl x17, x17, x11
-; CHECK-NEXT:    lsr x19, x14, x19
-; CHECK-NEXT:    adcs x14, x12, xzr
-; CHECK-NEXT:    lsl x12, x13, x11
-; CHECK-NEXT:    lsr x13, x20, x18
-; CHECK-NEXT:    lsl x20, x15, x11
-; CHECK-NEXT:    lsr x18, x21, x18
-; CHECK-NEXT:    lsl x11, x16, x11
-; CHECK-NEXT:    cset w21, hs
-; CHECK-NEXT:    mov x16, xzr
-; CHECK-NEXT:    orr x15, x12, x13
-; CHECK-NEXT:    orr x13, x17, x19
-; CHECK-NEXT:    orr x12, x20, x18
-; CHECK-NEXT:    tbnz w21, #0, .LBB0_5
-; CHECK-NEXT:  // %bb.3: // %udiv-preheader
-; CHECK-NEXT:    lsr x20, x8, #3
-; CHECK-NEXT:    stp x0, x1, [sp]
-; CHECK-NEXT:    mov x1, sp
-; CHECK-NEXT:    stp q0, q0, [sp, #32]
-; CHECK-NEXT:    mov x18, xzr
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    and x0, x20, #0x18
-; CHECK-NEXT:    stp x2, x3, [sp, #16]
-; CHECK-NEXT:    and x2, x8, #0x3f
-; CHECK-NEXT:    add x0, x1, x0
-; CHECK-NEXT:    eor x2, x2, #0x3f
-; CHECK-NEXT:    mvn w20, w8
-; CHECK-NEXT:    ldp x1, x3, [x0, #16]
-; CHECK-NEXT:    mov x17, xzr
-; CHECK-NEXT:    ldp x24, x21, [x0]
-; CHECK-NEXT:    subs x0, x4, #1
-; CHECK-NEXT:    lsl x22, x3, #1
-; CHECK-NEXT:    lsl x23, x1, #1
-; CHECK-NEXT:    lsr x25, x1, x8
-; CHECK-NEXT:    lsl x26, x21, #1
-; CHECK-NEXT:    mov x1, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsr x27, x24, x8
-; CHECK-NEXT:    lsl x22, x22, x2
-; CHECK-NEXT:    lsl x20, x23, x20
-; CHECK-NEXT:    lsr x23, x21, x8
-; CHECK-NEXT:    lsl x26, x26, x2
-; CHECK-NEXT:    adcs x2, x5, x1
-; CHECK-NEXT:    orr x21, x22, x25
-; CHECK-NEXT:    lsr x22, x3, x8
-; CHECK-NEXT:    adcs x3, x6, x1
-; CHECK-NEXT:    orr x24, x23, x20
-; CHECK-NEXT:    orr x23, x26, x27
-; CHECK-NEXT:    adc x20, x7, x1
-; CHECK-NEXT:  .LBB0_4: // %udiv-do-while
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    extr x25, x23, x15, #63
-; CHECK-NEXT:    extr x26, x24, x23, #63
-; CHECK-NEXT:    extr x27, x21, x24, #63
-; CHECK-NEXT:    extr x22, x22, x21, #63
-; CHECK-NEXT:    extr x15, x15, x13, #63
-; CHECK-NEXT:    extr x13, x13, x12, #63
-; CHECK-NEXT:    cmp x0, x25
-; CHECK-NEXT:    sbcs xzr, x2, x26
-; CHECK-NEXT:    orr x13, x19, x13
-; CHECK-NEXT:    orr x15, x17, x15
-; CHECK-NEXT:    sbcs xzr, x3, x27
-; CHECK-NEXT:    mov x17, xzr
-; CHECK-NEXT:    sbc x21, x20, x22
-; CHECK-NEXT:    asr x28, x21, #63
-; CHECK-NEXT:    and x21, x28, x4
-; CHECK-NEXT:    subs x23, x25, x21
-; CHECK-NEXT:    and x21, x28, x5
-; CHECK-NEXT:    and x25, x28, x7
-; CHECK-NEXT:    sbcs x24, x26, x21
-; CHECK-NEXT:    and x21, x28, x6
-; CHECK-NEXT:    sbcs x21, x27, x21
-; CHECK-NEXT:    sbc x22, x22, x25
-; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    extr x25, x12, x11, #63
-; CHECK-NEXT:    adcs x9, x9, x1
-; CHECK-NEXT:    orr x11, x16, x11, lsl #1
-; CHECK-NEXT:    and x16, x28, #0x1
-; CHECK-NEXT:    adcs x10, x10, x1
-; CHECK-NEXT:    orr x12, x18, x25
-; CHECK-NEXT:    adc x14, x14, x1
-; CHECK-NEXT:    orr x19, x8, x10
-; CHECK-NEXT:    orr x18, x9, x14
-; CHECK-NEXT:    orr x25, x19, x18
-; CHECK-NEXT:    mov x18, xzr
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    cbnz x25, .LBB0_4
-; CHECK-NEXT:  .LBB0_5: // %udiv-loop-exit
-; CHECK-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x10, x12, x11, #63
-; CHECK-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x9, x13, x12, #63
-; CHECK-NEXT:    ldp x24, x23, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x8, x15, x13, #63
-; CHECK-NEXT:    ldp x26, x25, [sp, #144] // 16-byte Folded Reload
-; CHECK-NEXT:    orr x11, x16, x11, lsl #1
-; CHECK-NEXT:    ldp x28, x27, [sp, #128] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #208
-; CHECK-NEXT:  .LBB0_6: // %udiv-end
-; CHECK-NEXT:    mov x0, x11
-; CHECK-NEXT:    mov x1, x10
-; CHECK-NEXT:    mov x2, x9
-; CHECK-NEXT:    mov x3, x8
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __udivoi3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = udiv i256 %a, %b
   ret i256 %r
@@ -202,223 +14,10 @@ define i256 @udiv256(i256 %a, i256 %b) nounwind {
 
 define i256 @sdiv256(i256 %a, i256 %b) nounwind {
 ; CHECK-LABEL: sdiv256:
-; CHECK:       // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    asr x12, x3, #63
-; CHECK-NEXT:    asr x13, x7, #63
-; CHECK-NEXT:    eor x8, x0, x12
-; CHECK-NEXT:    eor x9, x1, x12
-; CHECK-NEXT:    eor x10, x4, x13
-; CHECK-NEXT:    subs x14, x8, x12
-; CHECK-NEXT:    eor x8, x2, x12
-; CHECK-NEXT:    eor x11, x5, x13
-; CHECK-NEXT:    sbcs x15, x9, x12
-; CHECK-NEXT:    eor x9, x3, x12
-; CHECK-NEXT:    clz x16, x14
-; CHECK-NEXT:    sbcs x18, x8, x12
-; CHECK-NEXT:    clz x17, x15
-; CHECK-NEXT:    add x16, x16, #64
-; CHECK-NEXT:    sbc x0, x9, x12
-; CHECK-NEXT:    subs x8, x10, x13
-; CHECK-NEXT:    eor x10, x6, x13
-; CHECK-NEXT:    sbcs x9, x11, x13
-; CHECK-NEXT:    eor x11, x7, x13
-; CHECK-NEXT:    orr x1, x14, x18
-; CHECK-NEXT:    sbcs x10, x10, x13
-; CHECK-NEXT:    orr x4, x15, x0
-; CHECK-NEXT:    clz x5, x8
-; CHECK-NEXT:    sbc x11, x11, x13
-; CHECK-NEXT:    orr x2, x8, x10
-; CHECK-NEXT:    orr x1, x1, x4
-; CHECK-NEXT:    orr x3, x9, x11
-; CHECK-NEXT:    add x5, x5, #64
-; CHECK-NEXT:    orr x6, x10, x11
-; CHECK-NEXT:    orr x2, x2, x3
-; CHECK-NEXT:    clz x3, x10
-; CHECK-NEXT:    eor x12, x13, x12
-; CHECK-NEXT:    cmp x2, #0
-; CHECK-NEXT:    add x3, x3, #64
-; CHECK-NEXT:    clz x2, x9
-; CHECK-NEXT:    ccmp x1, #0, #4, ne
-; CHECK-NEXT:    clz x1, x11
-; CHECK-NEXT:    cset w4, eq
-; CHECK-NEXT:    cmp x11, #0
-; CHECK-NEXT:    csel x1, x1, x3, ne
-; CHECK-NEXT:    cmp x9, #0
-; CHECK-NEXT:    clz x3, x18
-; CHECK-NEXT:    csel x2, x2, x5, ne
-; CHECK-NEXT:    cmp x6, #0
-; CHECK-NEXT:    add x3, x3, #64
-; CHECK-NEXT:    add x2, x2, #128
-; CHECK-NEXT:    clz x5, x0
-; CHECK-NEXT:    csel x1, x1, x2, ne
-; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    csel x2, x5, x3, ne
-; CHECK-NEXT:    cmp x15, #0
-; CHECK-NEXT:    orr x3, x18, x0
-; CHECK-NEXT:    csel x16, x17, x16, ne
-; CHECK-NEXT:    cmp x3, #0
-; CHECK-NEXT:    mov w3, #255 // =0xff
-; CHECK-NEXT:    add x16, x16, #128
-; CHECK-NEXT:    csel x16, x2, x16, ne
-; CHECK-NEXT:    subs x2, x1, x16
-; CHECK-NEXT:    ngcs x16, xzr
-; CHECK-NEXT:    ngcs x17, xzr
-; CHECK-NEXT:    ngc x1, xzr
-; CHECK-NEXT:    cmp x3, x2
-; CHECK-NEXT:    ngcs xzr, x16
-; CHECK-NEXT:    ngcs xzr, x17
-; CHECK-NEXT:    ngcs xzr, x1
-; CHECK-NEXT:    csinc w5, w4, wzr, hs
-; CHECK-NEXT:    cmp w5, #0
-; CHECK-NEXT:    csel x13, xzr, x0, ne
-; CHECK-NEXT:    csel x4, xzr, x18, ne
-; CHECK-NEXT:    csel x7, xzr, x15, ne
-; CHECK-NEXT:    csel x3, xzr, x14, ne
-; CHECK-NEXT:    tbnz w5, #0, .LBB1_6
-; CHECK-NEXT:  // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT:    eor x5, x2, #0xff
-; CHECK-NEXT:    orr x6, x16, x1
-; CHECK-NEXT:    orr x5, x5, x17
-; CHECK-NEXT:    orr x5, x5, x6
-; CHECK-NEXT:    cbz x5, .LBB1_6
-; CHECK-NEXT:  // %bb.2: // %udiv-bb1
-; CHECK-NEXT:    sub sp, sp, #224
-; CHECK-NEXT:    mov w13, #255 // =0xff
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    add x4, sp, #64
-; CHECK-NEXT:    sub x3, x13, x2
-; CHECK-NEXT:    add x4, x4, #32
-; CHECK-NEXT:    stp x0, x29, [sp, #120] // 8-byte Folded Spill
-; CHECK-NEXT:    lsr x13, x3, #3
-; CHECK-NEXT:    stp x28, x27, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x26, x25, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT:    and x13, x13, #0x18
-; CHECK-NEXT:    stp x24, x23, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT:    sub x4, x4, x13
-; CHECK-NEXT:    adds x13, x2, #1
-; CHECK-NEXT:    stp x20, x19, [sp, #208] // 16-byte Folded Spill
-; CHECK-NEXT:    and x19, x3, #0x3f
-; CHECK-NEXT:    adcs x16, x16, xzr
-; CHECK-NEXT:    stp x14, x15, [sp, #96]
-; CHECK-NEXT:    mvn w20, w3
-; CHECK-NEXT:    eor x19, x19, #0x3f
-; CHECK-NEXT:    str x18, [sp, #112]
-; CHECK-NEXT:    adcs x17, x17, xzr
-; CHECK-NEXT:    stp q0, q0, [sp, #64]
-; CHECK-NEXT:    ldp x2, x6, [x4, #8]
-; CHECK-NEXT:    ldr x7, [x4]
-; CHECK-NEXT:    ldr x5, [x4, #24]
-; CHECK-NEXT:    lsr x22, x7, #1
-; CHECK-NEXT:    lsr x4, x2, #1
-; CHECK-NEXT:    lsr x21, x6, #1
-; CHECK-NEXT:    lsl x5, x5, x3
-; CHECK-NEXT:    lsl x6, x6, x3
-; CHECK-NEXT:    lsl x2, x2, x3
-; CHECK-NEXT:    lsr x20, x4, x20
-; CHECK-NEXT:    lsr x21, x21, x19
-; CHECK-NEXT:    lsr x19, x22, x19
-; CHECK-NEXT:    adcs x4, x1, xzr
-; CHECK-NEXT:    lsl x1, x7, x3
-; CHECK-NEXT:    cset w22, hs
-; CHECK-NEXT:    orr x5, x5, x21
-; CHECK-NEXT:    orr x3, x6, x20
-; CHECK-NEXT:    orr x2, x2, x19
-; CHECK-NEXT:    mov x6, xzr
-; CHECK-NEXT:    tbnz w22, #0, .LBB1_5
-; CHECK-NEXT:  // %bb.3: // %udiv-preheader
-; CHECK-NEXT:    lsr x21, x13, #3
-; CHECK-NEXT:    stp x14, x15, [sp]
-; CHECK-NEXT:    mov x15, sp
-; CHECK-NEXT:    stp q0, q0, [sp, #32]
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    and x14, x21, #0x18
-; CHECK-NEXT:    stp x18, x0, [sp, #16]
-; CHECK-NEXT:    and x18, x13, #0x3f
-; CHECK-NEXT:    add x14, x15, x14
-; CHECK-NEXT:    eor x18, x18, #0x3f
-; CHECK-NEXT:    mvn w21, w13
-; CHECK-NEXT:    ldp x15, x0, [x14, #16]
-; CHECK-NEXT:    mov x7, xzr
-; CHECK-NEXT:    ldp x25, x22, [x14]
-; CHECK-NEXT:    subs x14, x8, #1
-; CHECK-NEXT:    lsl x23, x0, #1
-; CHECK-NEXT:    lsl x24, x15, #1
-; CHECK-NEXT:    lsr x26, x15, x13
-; CHECK-NEXT:    lsl x27, x22, #1
-; CHECK-NEXT:    mov x15, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsr x28, x25, x13
-; CHECK-NEXT:    lsl x23, x23, x18
-; CHECK-NEXT:    lsl x21, x24, x21
-; CHECK-NEXT:    lsr x24, x22, x13
-; CHECK-NEXT:    lsl x27, x27, x18
-; CHECK-NEXT:    adcs x18, x9, x15
-; CHECK-NEXT:    orr x22, x23, x26
-; CHECK-NEXT:    lsr x23, x0, x13
-; CHECK-NEXT:    adcs x0, x10, x15
-; CHECK-NEXT:    orr x25, x24, x21
-; CHECK-NEXT:    orr x24, x27, x28
-; CHECK-NEXT:    adc x21, x11, x15
-; CHECK-NEXT:  .LBB1_4: // %udiv-do-while
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    extr x26, x24, x5, #63
-; CHECK-NEXT:    extr x27, x25, x24, #63
-; CHECK-NEXT:    extr x28, x22, x25, #63
-; CHECK-NEXT:    extr x23, x23, x22, #63
-; CHECK-NEXT:    extr x5, x5, x3, #63
-; CHECK-NEXT:    extr x3, x3, x2, #63
-; CHECK-NEXT:    cmp x14, x26
-; CHECK-NEXT:    sbcs xzr, x18, x27
-; CHECK-NEXT:    orr x3, x20, x3
-; CHECK-NEXT:    orr x5, x7, x5
-; CHECK-NEXT:    sbcs xzr, x0, x28
-; CHECK-NEXT:    mov x7, xzr
-; CHECK-NEXT:    sbc x22, x21, x23
-; CHECK-NEXT:    asr x29, x22, #63
-; CHECK-NEXT:    and x22, x29, x8
-; CHECK-NEXT:    subs x24, x26, x22
-; CHECK-NEXT:    and x22, x29, x9
-; CHECK-NEXT:    and x26, x29, x11
-; CHECK-NEXT:    sbcs x25, x27, x22
-; CHECK-NEXT:    and x22, x29, x10
-; CHECK-NEXT:    sbcs x22, x28, x22
-; CHECK-NEXT:    sbc x23, x23, x26
-; CHECK-NEXT:    subs x13, x13, #1
-; CHECK-NEXT:    extr x26, x2, x1, #63
-; CHECK-NEXT:    adcs x16, x16, x15
-; CHECK-NEXT:    orr x1, x6, x1, lsl #1
-; CHECK-NEXT:    and x6, x29, #0x1
-; CHECK-NEXT:    adcs x17, x17, x15
-; CHECK-NEXT:    orr x2, x19, x26
-; CHECK-NEXT:    adc x4, x4, x15
-; CHECK-NEXT:    orr x20, x13, x17
-; CHECK-NEXT:    orr x19, x16, x4
-; CHECK-NEXT:    orr x26, x20, x19
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    cbnz x26, .LBB1_4
-; CHECK-NEXT:  .LBB1_5: // %udiv-loop-exit
-; CHECK-NEXT:    ldp x20, x19, [sp, #208] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x7, x2, x1, #63
-; CHECK-NEXT:    ldp x22, x21, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x4, x3, x2, #63
-; CHECK-NEXT:    ldp x24, x23, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x13, x5, x3, #63
-; CHECK-NEXT:    ldp x26, x25, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT:    orr x3, x6, x1, lsl #1
-; CHECK-NEXT:    ldp x28, x27, [sp, #144] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x29, [sp, #128] // 8-byte Reload
-; CHECK-NEXT:    add sp, sp, #224
-; CHECK-NEXT:  .LBB1_6: // %udiv-end
-; CHECK-NEXT:    eor x8, x3, x12
-; CHECK-NEXT:    eor x9, x7, x12
-; CHECK-NEXT:    subs x0, x8, x12
-; CHECK-NEXT:    eor x8, x4, x12
-; CHECK-NEXT:    sbcs x1, x9, x12
-; CHECK-NEXT:    eor x9, x13, x12
-; CHECK-NEXT:    sbcs x2, x8, x12
-; CHECK-NEXT:    sbc x3, x9, x12
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __divoi3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = sdiv i256 %a, %b
   ret i256 %r
@@ -426,232 +25,10 @@ define i256 @sdiv256(i256 %a, i256 %b) nounwind {
 
 define i256 @urem256(i256 %a, i256 %b) nounwind {
 ; CHECK-LABEL: urem256:
-; CHECK:       // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    orr x8, x5, x7
-; CHECK-NEXT:    orr x9, x4, x6
-; CHECK-NEXT:    orr x10, x0, x2
-; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    orr x9, x1, x3
-; CHECK-NEXT:    clz x11, x1
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    orr x8, x10, x9
-; CHECK-NEXT:    clz x9, x7
-; CHECK-NEXT:    ccmp x8, #0, #4, ne
-; CHECK-NEXT:    clz x8, x6
-; CHECK-NEXT:    clz x10, x5
-; CHECK-NEXT:    cset w13, eq
-; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    cmp x7, #0
-; CHECK-NEXT:    csel x8, x9, x8, ne
-; CHECK-NEXT:    clz x9, x4
-; CHECK-NEXT:    cmp x5, #0
-; CHECK-NEXT:    add x9, x9, #64
-; CHECK-NEXT:    csel x9, x10, x9, ne
-; CHECK-NEXT:    orr x10, x6, x7
-; CHECK-NEXT:    add x9, x9, #128
-; CHECK-NEXT:    cmp x10, #0
-; CHECK-NEXT:    clz x10, x3
-; CHECK-NEXT:    csel x8, x8, x9, ne
-; CHECK-NEXT:    clz x9, x2
-; CHECK-NEXT:    cmp x3, #0
-; CHECK-NEXT:    add x9, x9, #64
-; CHECK-NEXT:    csel x9, x10, x9, ne
-; CHECK-NEXT:    clz x10, x0
-; CHECK-NEXT:    cmp x1, #0
-; CHECK-NEXT:    add x10, x10, #64
-; CHECK-NEXT:    csel x10, x11, x10, ne
-; CHECK-NEXT:    orr x11, x2, x3
-; CHECK-NEXT:    add x10, x10, #128
-; CHECK-NEXT:    cmp x11, #0
-; CHECK-NEXT:    csel x9, x9, x10, ne
-; CHECK-NEXT:    subs x12, x8, x9
-; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    ngcs x9, xzr
-; CHECK-NEXT:    ngcs x10, xzr
-; CHECK-NEXT:    ngc x11, xzr
-; CHECK-NEXT:    cmp x8, x12
-; CHECK-NEXT:    ngcs xzr, x9
-; CHECK-NEXT:    ngcs xzr, x10
-; CHECK-NEXT:    ngcs xzr, x11
-; CHECK-NEXT:    csinc w15, w13, wzr, hs
-; CHECK-NEXT:    cmp w15, #0
-; CHECK-NEXT:    csel x13, xzr, x3, ne
-; CHECK-NEXT:    csel x17, xzr, x2, ne
-; CHECK-NEXT:    csel x14, xzr, x1, ne
-; CHECK-NEXT:    csel x8, xzr, x0, ne
-; CHECK-NEXT:    tbnz w15, #0, .LBB2_6
-; CHECK-NEXT:  // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT:    eor x15, x12, #0xff
-; CHECK-NEXT:    orr x16, x9, x11
-; CHECK-NEXT:    orr x15, x15, x10
-; CHECK-NEXT:    orr x15, x15, x16
-; CHECK-NEXT:    cbz x15, .LBB2_6
-; CHECK-NEXT:  // %bb.2: // %udiv-bb1
-; CHECK-NEXT:    sub sp, sp, #256
-; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    add x14, sp, #96
-; CHECK-NEXT:    sub x13, x8, x12
-; CHECK-NEXT:    add x14, x14, #32
-; CHECK-NEXT:    stp x29, x30, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x8, x13, #3
-; CHECK-NEXT:    stp x28, x27, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT:    and x18, x13, #0x3f
-; CHECK-NEXT:    stp x26, x25, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT:    eor x18, x18, #0x3f
-; CHECK-NEXT:    and x8, x8, #0x18
-; CHECK-NEXT:    stp x24, x23, [sp, #208] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #224] // 16-byte Folded Spill
-; CHECK-NEXT:    sub x14, x14, x8
-; CHECK-NEXT:    adds x8, x12, #1
-; CHECK-NEXT:    stp x20, x19, [sp, #240] // 16-byte Folded Spill
-; CHECK-NEXT:    mvn w19, w13
-; CHECK-NEXT:    adcs x9, x9, xzr
-; CHECK-NEXT:    stp x0, x1, [sp, #128]
-; CHECK-NEXT:    adcs x10, x10, xzr
-; CHECK-NEXT:    stp x2, x3, [sp, #144]
-; CHECK-NEXT:    stp q0, q0, [sp, #96]
-; CHECK-NEXT:    ldp x12, x16, [x14, #8]
-; CHECK-NEXT:    ldr x15, [x14, #24]
-; CHECK-NEXT:    ldr x17, [x14]
-; CHECK-NEXT:    stp x1, x2, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsl x15, x15, x13
-; CHECK-NEXT:    str x0, [sp, #8] // 8-byte Spill
-; CHECK-NEXT:    lsr x14, x12, #1
-; CHECK-NEXT:    lsr x20, x16, #1
-; CHECK-NEXT:    lsl x16, x16, x13
-; CHECK-NEXT:    lsl x12, x12, x13
-; CHECK-NEXT:    lsr x21, x14, x19
-; CHECK-NEXT:    lsr x19, x17, #1
-; CHECK-NEXT:    lsr x20, x20, x18
-; CHECK-NEXT:    adcs x14, x11, xzr
-; CHECK-NEXT:    lsl x11, x17, x13
-; CHECK-NEXT:    lsr x18, x19, x18
-; CHECK-NEXT:    cset w19, hs
-; CHECK-NEXT:    orr x15, x15, x20
-; CHECK-NEXT:    orr x13, x16, x21
-; CHECK-NEXT:    mov x16, xzr
-; CHECK-NEXT:    orr x12, x12, x18
-; CHECK-NEXT:    tbnz w19, #0, .LBB2_5
-; CHECK-NEXT:  // %bb.3: // %udiv-preheader
-; CHECK-NEXT:    lsr x20, x8, #3
-; CHECK-NEXT:    stp x0, x1, [sp, #32]
-; CHECK-NEXT:    add x1, sp, #32
-; CHECK-NEXT:    stp q0, q0, [sp, #64]
-; CHECK-NEXT:    and x21, x8, #0x3f
-; CHECK-NEXT:    mvn w22, w8
-; CHECK-NEXT:    and x0, x20, #0x18
-; CHECK-NEXT:    stp x2, x3, [sp, #48]
-; CHECK-NEXT:    eor x26, x21, #0x3f
-; CHECK-NEXT:    add x0, x1, x0
-; CHECK-NEXT:    mov x21, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov x18, xzr
-; CHECK-NEXT:    ldp x2, x20, [x0, #16]
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    ldp x0, x23, [x0]
-; CHECK-NEXT:    mov x17, xzr
-; CHECK-NEXT:    lsl x24, x20, #1
-; CHECK-NEXT:    lsl x25, x2, #1
-; CHECK-NEXT:    lsr x27, x2, x8
-; CHECK-NEXT:    lsl x28, x23, #1
-; CHECK-NEXT:    subs x2, x4, #1
-; CHECK-NEXT:    lsr x30, x23, x8
-; CHECK-NEXT:    lsl x24, x24, x26
-; CHECK-NEXT:    lsl x29, x25, x22
-; CHECK-NEXT:    lsr x1, x0, x8
-; CHECK-NEXT:    lsl x0, x28, x26
-; CHECK-NEXT:    adcs x22, x5, x21
-; CHECK-NEXT:    lsr x26, x20, x8
-; CHECK-NEXT:    adcs x23, x6, x21
-; CHECK-NEXT:    orr x25, x24, x27
-; CHECK-NEXT:    orr x28, x30, x29
-; CHECK-NEXT:    orr x27, x0, x1
-; CHECK-NEXT:    adc x24, x7, x21
-; CHECK-NEXT:  .LBB2_4: // %udiv-do-while
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    extr x29, x27, x15, #63
-; CHECK-NEXT:    extr x30, x28, x27, #63
-; CHECK-NEXT:    extr x20, x25, x28, #63
-; CHECK-NEXT:    extr x26, x26, x25, #63
-; CHECK-NEXT:    extr x15, x15, x13, #63
-; CHECK-NEXT:    extr x13, x13, x12, #63
-; CHECK-NEXT:    cmp x2, x29
-; CHECK-NEXT:    sbcs xzr, x22, x30
-; CHECK-NEXT:    orr x13, x19, x13
-; CHECK-NEXT:    orr x15, x17, x15
-; CHECK-NEXT:    sbcs xzr, x23, x20
-; CHECK-NEXT:    mov x17, xzr
-; CHECK-NEXT:    sbc x25, x24, x26
-; CHECK-NEXT:    asr x0, x25, #63
-; CHECK-NEXT:    and x25, x0, x4
-; CHECK-NEXT:    subs x27, x29, x25
-; CHECK-NEXT:    and x25, x0, x5
-; CHECK-NEXT:    sbcs x28, x30, x25
-; CHECK-NEXT:    and x25, x0, x6
-; CHECK-NEXT:    sbcs x25, x20, x25
-; CHECK-NEXT:    and x20, x0, x7
-; CHECK-NEXT:    sbc x26, x26, x20
-; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    extr x20, x12, x11, #63
-; CHECK-NEXT:    adcs x9, x9, x21
-; CHECK-NEXT:    orr x11, x16, x11, lsl #1
-; CHECK-NEXT:    and x16, x0, #0x1
-; CHECK-NEXT:    adcs x10, x10, x21
-; CHECK-NEXT:    orr x12, x18, x20
-; CHECK-NEXT:    adc x14, x14, x21
-; CHECK-NEXT:    orr x19, x8, x10
-; CHECK-NEXT:    orr x18, x9, x14
-; CHECK-NEXT:    orr x0, x19, x18
-; CHECK-NEXT:    mov x18, xzr
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    cbnz x0, .LBB2_4
-; CHECK-NEXT:  .LBB2_5: // %udiv-loop-exit
-; CHECK-NEXT:    ldp x1, x2, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x14, x12, x11, #63
-; CHECK-NEXT:    ldp x20, x19, [sp, #240] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x17, x13, x12, #63
-; CHECK-NEXT:    ldp x22, x21, [sp, #224] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x13, x15, x13, #63
-; CHECK-NEXT:    ldp x24, x23, [sp, #208] // 16-byte Folded Reload
-; CHECK-NEXT:    orr x8, x16, x11, lsl #1
-; CHECK-NEXT:    ldp x26, x25, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x0, [sp, #8] // 8-byte Reload
-; CHECK-NEXT:    ldp x28, x27, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #256
-; CHECK-NEXT:  .LBB2_6: // %udiv-end
-; CHECK-NEXT:    umulh x9, x17, x4
-; CHECK-NEXT:    umulh x10, x8, x6
-; CHECK-NEXT:    madd x9, x17, x5, x9
-; CHECK-NEXT:    madd x10, x8, x7, x10
-; CHECK-NEXT:    mul x11, x8, x6
-; CHECK-NEXT:    mul x12, x17, x4
-; CHECK-NEXT:    madd x9, x13, x4, x9
-; CHECK-NEXT:    madd x10, x14, x6, x10
-; CHECK-NEXT:    adds x11, x12, x11
-; CHECK-NEXT:    umulh x15, x4, x8
-; CHECK-NEXT:    mul x16, x5, x8
-; CHECK-NEXT:    adc x9, x9, x10
-; CHECK-NEXT:    umulh x13, x5, x8
-; CHECK-NEXT:    mul x18, x4, x14
-; CHECK-NEXT:    adds x10, x16, x15
-; CHECK-NEXT:    umulh x17, x4, x14
-; CHECK-NEXT:    cinc x13, x13, hs
-; CHECK-NEXT:    mul x12, x5, x14
-; CHECK-NEXT:    adds x10, x18, x10
-; CHECK-NEXT:    umulh x6, x5, x14
-; CHECK-NEXT:    cinc x14, x17, hs
-; CHECK-NEXT:    mul x8, x4, x8
-; CHECK-NEXT:    adds x13, x13, x14
-; CHECK-NEXT:    cset w14, hs
-; CHECK-NEXT:    adds x12, x12, x13
-; CHECK-NEXT:    adc x13, x6, x14
-; CHECK-NEXT:    adds x11, x12, x11
-; CHECK-NEXT:    adc x9, x13, x9
-; CHECK-NEXT:    subs x0, x0, x8
-; CHECK-NEXT:    sbcs x1, x1, x10
-; CHECK-NEXT:    sbcs x2, x2, x11
-; CHECK-NEXT:    sbc x3, x3, x9
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __umodoi3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = urem i256 %a, %b
   ret i256 %r
@@ -659,258 +36,10 @@ define i256 @urem256(i256 %a, i256 %b) nounwind {
 
 define i256 @srem256(i256 %a, i256 %b) nounwind {
 ; CHECK-LABEL: srem256:
-; CHECK:       // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    sub sp, sp, #256
-; CHECK-NEXT:    asr x8, x3, #63
-; CHECK-NEXT:    stp x22, x21, [sp, #224] // 16-byte Folded Spill
-; CHECK-NEXT:    asr x16, x7, #63
-; CHECK-NEXT:    stp x24, x23, [sp, #208] // 16-byte Folded Spill
-; CHECK-NEXT:    eor x9, x0, x8
-; CHECK-NEXT:    eor x10, x1, x8
-; CHECK-NEXT:    eor x11, x2, x8
-; CHECK-NEXT:    subs x21, x9, x8
-; CHECK-NEXT:    eor x13, x3, x8
-; CHECK-NEXT:    eor x14, x4, x16
-; CHECK-NEXT:    sbcs x22, x10, x8
-; CHECK-NEXT:    eor x15, x5, x16
-; CHECK-NEXT:    eor x17, x6, x16
-; CHECK-NEXT:    sbcs x23, x11, x8
-; CHECK-NEXT:    eor x18, x7, x16
-; CHECK-NEXT:    clz x0, x22
-; CHECK-NEXT:    sbc x11, x13, x8
-; CHECK-NEXT:    subs x13, x14, x16
-; CHECK-NEXT:    stp x29, x30, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT:    sbcs x14, x15, x16
-; CHECK-NEXT:    orr x3, x22, x11
-; CHECK-NEXT:    clz x4, x13
-; CHECK-NEXT:    sbcs x15, x17, x16
-; CHECK-NEXT:    add x4, x4, #64
-; CHECK-NEXT:    clz x17, x21
-; CHECK-NEXT:    sbc x16, x18, x16
-; CHECK-NEXT:    orr x1, x13, x15
-; CHECK-NEXT:    orr x18, x21, x23
-; CHECK-NEXT:    orr x2, x14, x16
-; CHECK-NEXT:    orr x18, x18, x3
-; CHECK-NEXT:    orr x5, x15, x16
-; CHECK-NEXT:    orr x1, x1, x2
-; CHECK-NEXT:    clz x2, x15
-; CHECK-NEXT:    add x17, x17, #64
-; CHECK-NEXT:    cmp x1, #0
-; CHECK-NEXT:    add x2, x2, #64
-; CHECK-NEXT:    clz x1, x14
-; CHECK-NEXT:    ccmp x18, #0, #4, ne
-; CHECK-NEXT:    clz x18, x16
-; CHECK-NEXT:    stp x28, x27, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT:    cset w3, eq
-; CHECK-NEXT:    cmp x16, #0
-; CHECK-NEXT:    stp x26, x25, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT:    csel x18, x18, x2, ne
-; CHECK-NEXT:    cmp x14, #0
-; CHECK-NEXT:    clz x2, x23
-; CHECK-NEXT:    csel x1, x1, x4, ne
-; CHECK-NEXT:    cmp x5, #0
-; CHECK-NEXT:    add x2, x2, #64
-; CHECK-NEXT:    add x1, x1, #128
-; CHECK-NEXT:    clz x4, x11
-; CHECK-NEXT:    stp x20, x19, [sp, #240] // 16-byte Folded Spill
-; CHECK-NEXT:    csel x18, x18, x1, ne
-; CHECK-NEXT:    cmp x11, #0
-; CHECK-NEXT:    csel x1, x4, x2, ne
-; CHECK-NEXT:    cmp x22, #0
-; CHECK-NEXT:    orr x2, x23, x11
-; CHECK-NEXT:    csel x17, x0, x17, ne
-; CHECK-NEXT:    cmp x2, #0
-; CHECK-NEXT:    add x17, x17, #128
-; CHECK-NEXT:    csel x17, x1, x17, ne
-; CHECK-NEXT:    subs x2, x18, x17
-; CHECK-NEXT:    mov w17, #255 // =0xff
-; CHECK-NEXT:    ngcs x18, xzr
-; CHECK-NEXT:    ngcs x0, xzr
-; CHECK-NEXT:    ngc x1, xzr
-; CHECK-NEXT:    cmp x17, x2
-; CHECK-NEXT:    ngcs xzr, x18
-; CHECK-NEXT:    ngcs xzr, x0
-; CHECK-NEXT:    ngcs xzr, x1
-; CHECK-NEXT:    csinc w5, w3, wzr, hs
-; CHECK-NEXT:    cmp w5, #0
-; CHECK-NEXT:    csel x3, xzr, x11, ne
-; CHECK-NEXT:    csel x7, xzr, x23, ne
-; CHECK-NEXT:    csel x4, xzr, x22, ne
-; CHECK-NEXT:    csel x17, xzr, x21, ne
-; CHECK-NEXT:    tbnz w5, #0, .LBB3_6
-; CHECK-NEXT:  // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT:    eor x5, x2, #0xff
-; CHECK-NEXT:    orr x6, x18, x1
-; CHECK-NEXT:    orr x5, x5, x0
-; CHECK-NEXT:    orr x5, x5, x6
-; CHECK-NEXT:    cbz x5, .LBB3_6
-; CHECK-NEXT:  // %bb.2: // %udiv-bb1
-; CHECK-NEXT:    mov w9, #255 // =0xff
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    add x12, sp, #96
-; CHECK-NEXT:    sub x9, x9, x2
-; CHECK-NEXT:    add x12, x12, #32
-; CHECK-NEXT:    stp x21, x22, [sp, #128]
-; CHECK-NEXT:    lsr x10, x9, #3
-; CHECK-NEXT:    stp x23, x11, [sp, #144]
-; CHECK-NEXT:    adds x17, x2, #1
-; CHECK-NEXT:    and x5, x9, #0x3f
-; CHECK-NEXT:    adcs x18, x18, xzr
-; CHECK-NEXT:    mvn w6, w9
-; CHECK-NEXT:    and x10, x10, #0x18
-; CHECK-NEXT:    stp q0, q0, [sp, #96]
-; CHECK-NEXT:    eor x5, x5, #0x3f
-; CHECK-NEXT:    sub x10, x12, x10
-; CHECK-NEXT:    adcs x0, x0, xzr
-; CHECK-NEXT:    ldp x12, x3, [x10, #8]
-; CHECK-NEXT:    ldr x2, [x10, #24]
-; CHECK-NEXT:    ldr x10, [x10]
-; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsl x2, x2, x9
-; CHECK-NEXT:    str x23, [sp, #8] // 8-byte Spill
-; CHECK-NEXT:    lsr x4, x12, #1
-; CHECK-NEXT:    lsr x7, x3, #1
-; CHECK-NEXT:    lsr x19, x10, #1
-; CHECK-NEXT:    lsl x3, x3, x9
-; CHECK-NEXT:    lsl x12, x12, x9
-; CHECK-NEXT:    lsr x6, x4, x6
-; CHECK-NEXT:    lsr x7, x7, x5
-; CHECK-NEXT:    lsr x19, x19, x5
-; CHECK-NEXT:    adcs x4, x1, xzr
-; CHECK-NEXT:    lsl x1, x10, x9
-; CHECK-NEXT:    cset w20, hs
-; CHECK-NEXT:    orr x5, x2, x7
-; CHECK-NEXT:    orr x3, x3, x6
-; CHECK-NEXT:    orr x2, x12, x19
-; CHECK-NEXT:    mov x6, xzr
-; CHECK-NEXT:    tbnz w20, #0, .LBB3_5
-; CHECK-NEXT:  // %bb.3: // %udiv-preheader
-; CHECK-NEXT:    lsr x9, x17, #3
-; CHECK-NEXT:    add x10, sp, #32
-; CHECK-NEXT:    stp q0, q0, [sp, #64]
-; CHECK-NEXT:    stp x21, x22, [sp, #32]
-; CHECK-NEXT:    and x21, x17, #0x3f
-; CHECK-NEXT:    mvn w22, w17
-; CHECK-NEXT:    and x9, x9, #0x18
-; CHECK-NEXT:    stp x23, x11, [sp, #48]
-; CHECK-NEXT:    eor x26, x21, #0x3f
-; CHECK-NEXT:    add x9, x10, x9
-; CHECK-NEXT:    subs x21, x13, #1
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    ldp x10, x12, [x9, #16]
-; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    ldp x9, x23, [x9]
-; CHECK-NEXT:    mov x7, xzr
-; CHECK-NEXT:    lsl x24, x12, #1
-; CHECK-NEXT:    lsl x25, x10, #1
-; CHECK-NEXT:    lsr x10, x10, x17
-; CHECK-NEXT:    lsl x27, x23, #1
-; CHECK-NEXT:    lsr x28, x23, x17
-; CHECK-NEXT:    lsr x9, x9, x17
-; CHECK-NEXT:    lsl x24, x24, x26
-; CHECK-NEXT:    lsl x25, x25, x22
-; CHECK-NEXT:    mov x22, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsl x30, x27, x26
-; CHECK-NEXT:    adcs x23, x14, x22
-; CHECK-NEXT:    lsr x27, x12, x17
-; CHECK-NEXT:    orr x26, x24, x10
-; CHECK-NEXT:    adcs x24, x15, x22
-; CHECK-NEXT:    orr x29, x28, x25
-; CHECK-NEXT:    orr x28, x30, x9
-; CHECK-NEXT:    adc x25, x16, x22
-; CHECK-NEXT:  .LBB3_4: // %udiv-do-while
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    extr x30, x28, x5, #63
-; CHECK-NEXT:    extr x9, x29, x28, #63
-; CHECK-NEXT:    extr x10, x26, x29, #63
-; CHECK-NEXT:    extr x27, x27, x26, #63
-; CHECK-NEXT:    cmp x21, x30
-; CHECK-NEXT:    sbcs xzr, x23, x9
-; CHECK-NEXT:    sbcs xzr, x24, x10
-; CHECK-NEXT:    sbc x26, x25, x27
-; CHECK-NEXT:    asr x12, x26, #63
-; CHECK-NEXT:    and x26, x12, x13
-; CHECK-NEXT:    subs x28, x30, x26
-; CHECK-NEXT:    and x26, x12, x14
-; CHECK-NEXT:    sbcs x29, x9, x26
-; CHECK-NEXT:    and x9, x12, x15
-; CHECK-NEXT:    sbcs x26, x10, x9
-; CHECK-NEXT:    and x9, x12, x16
-; CHECK-NEXT:    extr x10, x5, x3, #63
-; CHECK-NEXT:    sbc x27, x27, x9
-; CHECK-NEXT:    subs x17, x17, #1
-; CHECK-NEXT:    extr x9, x2, x1, #63
-; CHECK-NEXT:    adcs x18, x18, x22
-; CHECK-NEXT:    extr x3, x3, x2, #63
-; CHECK-NEXT:    orr x1, x6, x1, lsl #1
-; CHECK-NEXT:    adcs x0, x0, x22
-; CHECK-NEXT:    orr x2, x19, x9
-; CHECK-NEXT:    orr x5, x7, x10
-; CHECK-NEXT:    adc x4, x4, x22
-; CHECK-NEXT:    orr x19, x17, x0
-; CHECK-NEXT:    orr x3, x20, x3
-; CHECK-NEXT:    orr x9, x18, x4
-; CHECK-NEXT:    and x6, x12, #0x1
-; CHECK-NEXT:    mov x20, xzr
-; CHECK-NEXT:    orr x9, x19, x9
-; CHECK-NEXT:    mov x19, xzr
-; CHECK-NEXT:    mov x7, xzr
-; CHECK-NEXT:    cbnz x9, .LBB3_4
-; CHECK-NEXT:  .LBB3_5: // %udiv-loop-exit
-; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x4, x2, x1, #63
-; CHECK-NEXT:    extr x7, x3, x2, #63
-; CHECK-NEXT:    extr x3, x5, x3, #63
-; CHECK-NEXT:    orr x17, x6, x1, lsl #1
-; CHECK-NEXT:    ldr x23, [sp, #8] // 8-byte Reload
-; CHECK-NEXT:  .LBB3_6: // %udiv-end
-; CHECK-NEXT:    umulh x10, x17, x15
-; CHECK-NEXT:    ldp x20, x19, [sp, #240] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT:    umulh x9, x7, x13
-; CHECK-NEXT:    ldp x28, x27, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT:    madd x10, x17, x16, x10
-; CHECK-NEXT:    madd x9, x7, x14, x9
-; CHECK-NEXT:    madd x10, x4, x15, x10
-; CHECK-NEXT:    mul x12, x17, x15
-; CHECK-NEXT:    mul x15, x7, x13
-; CHECK-NEXT:    madd x9, x3, x13, x9
-; CHECK-NEXT:    umulh x18, x13, x17
-; CHECK-NEXT:    adds x12, x15, x12
-; CHECK-NEXT:    mul x0, x14, x17
-; CHECK-NEXT:    adc x9, x9, x10
-; CHECK-NEXT:    umulh x16, x14, x17
-; CHECK-NEXT:    mul x2, x13, x4
-; CHECK-NEXT:    adds x10, x0, x18
-; CHECK-NEXT:    umulh x1, x13, x4
-; CHECK-NEXT:    cinc x15, x16, hs
-; CHECK-NEXT:    umulh x3, x14, x4
-; CHECK-NEXT:    adds x10, x2, x10
-; CHECK-NEXT:    mul x14, x14, x4
-; CHECK-NEXT:    cinc x16, x1, hs
-; CHECK-NEXT:    mul x13, x13, x17
-; CHECK-NEXT:    adds x15, x15, x16
-; CHECK-NEXT:    cset w16, hs
-; CHECK-NEXT:    adds x14, x14, x15
-; CHECK-NEXT:    adc x15, x3, x16
-; CHECK-NEXT:    adds x12, x14, x12
-; CHECK-NEXT:    adc x9, x15, x9
-; CHECK-NEXT:    subs x13, x21, x13
-; CHECK-NEXT:    sbcs x10, x22, x10
-; CHECK-NEXT:    eor x13, x13, x8
-; CHECK-NEXT:    sbcs x12, x23, x12
-; CHECK-NEXT:    eor x10, x10, x8
-; CHECK-NEXT:    sbc x9, x11, x9
-; CHECK-NEXT:    subs x0, x13, x8
-; CHECK-NEXT:    eor x11, x12, x8
-; CHECK-NEXT:    sbcs x1, x10, x8
-; CHECK-NEXT:    ldp x22, x21, [sp, #224] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #208] // 16-byte Folded Reload
-; CHECK-NEXT:    eor x9, x9, x8
-; CHECK-NEXT:    sbcs x2, x11, x8
-; CHECK-NEXT:    sbc x3, x9, x8
-; CHECK-NEXT:    add sp, sp, #256
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __modoi3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = srem i256 %a, %b
   ret i256 %r
@@ -932,186 +61,14 @@ define i256 @udiv256_pow2(i256 %a) nounwind {
 ; Division by constant
 define i256 @sdiv256_const(i256 %a) nounwind {
 ; CHECK-LABEL: sdiv256_const:
-; CHECK:       // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    asr x8, x3, #63
-; CHECK-NEXT:    mov w13, #255 // =0xff
-; CHECK-NEXT:    eor x9, x0, x8
-; CHECK-NEXT:    eor x10, x1, x8
-; CHECK-NEXT:    subs x18, x9, x8
-; CHECK-NEXT:    eor x9, x2, x8
-; CHECK-NEXT:    sbcs x0, x10, x8
-; CHECK-NEXT:    eor x10, x3, x8
-; CHECK-NEXT:    sbcs x1, x9, x8
-; CHECK-NEXT:    clz x9, x18
-; CHECK-NEXT:    clz x11, x0
-; CHECK-NEXT:    sbcs x5, x10, x8
-; CHECK-NEXT:    clz x10, x1
-; CHECK-NEXT:    add x9, x9, #64
-; CHECK-NEXT:    add x10, x10, #64
-; CHECK-NEXT:    clz x12, x5
-; CHECK-NEXT:    orr x14, x0, x5
-; CHECK-NEXT:    csel x10, x12, x10, ne
-; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    orr x12, x1, x5
-; CHECK-NEXT:    csel x9, x11, x9, ne
-; CHECK-NEXT:    cmp x12, #0
-; CHECK-NEXT:    mov w11, #253 // =0xfd
-; CHECK-NEXT:    add x9, x9, #128
-; CHECK-NEXT:    csel x9, x10, x9, ne
-; CHECK-NEXT:    subs x9, x11, x9
-; CHECK-NEXT:    ngcs x10, xzr
-; CHECK-NEXT:    ngcs x11, xzr
-; CHECK-NEXT:    ngc x12, xzr
-; CHECK-NEXT:    cmp x13, x9
-; CHECK-NEXT:    orr x13, x18, x1
-; CHECK-NEXT:    ngcs xzr, x10
-; CHECK-NEXT:    orr x13, x13, x14
-; CHECK-NEXT:    ngcs xzr, x11
-; CHECK-NEXT:    ngcs xzr, x12
-; CHECK-NEXT:    ccmp x13, #0, #4, hs
-; CHECK-NEXT:    csel x13, xzr, x5, eq
-; CHECK-NEXT:    csel x15, xzr, x1, eq
-; CHECK-NEXT:    csel x2, xzr, x0, eq
-; CHECK-NEXT:    csel x14, xzr, x18, eq
-; CHECK-NEXT:    b.eq .LBB5_6
-; CHECK-NEXT:  // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT:    eor x16, x9, #0xff
-; CHECK-NEXT:    orr x17, x10, x12
-; CHECK-NEXT:    orr x16, x16, x11
-; CHECK-NEXT:    orr x16, x16, x17
-; CHECK-NEXT:    cbz x16, .LBB5_6
-; CHECK-NEXT:  // %bb.2: // %udiv-bb1
-; CHECK-NEXT:    sub sp, sp, #192
-; CHECK-NEXT:    mov w13, #255 // =0xff
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    add x15, sp, #64
-; CHECK-NEXT:    sub x13, x13, x9
-; CHECK-NEXT:    add x15, x15, #32
-; CHECK-NEXT:    stp x26, x25, [sp, #128] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x14, x13, #3
-; CHECK-NEXT:    stp x24, x23, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT:    adds x9, x9, #1
-; CHECK-NEXT:    stp x22, x21, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT:    and x3, x13, #0x3f
-; CHECK-NEXT:    adcs x10, x10, xzr
-; CHECK-NEXT:    and x14, x14, #0x18
-; CHECK-NEXT:    stp x20, x19, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT:    mvn w4, w13
-; CHECK-NEXT:    stp x18, x0, [sp, #96]
-; CHECK-NEXT:    sub x14, x15, x14
-; CHECK-NEXT:    eor x3, x3, #0x3f
-; CHECK-NEXT:    stp x1, x5, [sp, #112]
-; CHECK-NEXT:    adcs x11, x11, xzr
-; CHECK-NEXT:    stp q0, q0, [sp, #64]
-; CHECK-NEXT:    ldp x16, x2, [x14, #8]
-; CHECK-NEXT:    ldr x17, [x14, #24]
-; CHECK-NEXT:    ldr x14, [x14]
-; CHECK-NEXT:    lsl x17, x17, x13
-; CHECK-NEXT:    lsr x15, x16, #1
-; CHECK-NEXT:    lsr x6, x2, #1
-; CHECK-NEXT:    lsr x7, x14, #1
-; CHECK-NEXT:    lsl x2, x2, x13
-; CHECK-NEXT:    lsl x19, x16, x13
-; CHECK-NEXT:    lsr x4, x15, x4
-; CHECK-NEXT:    lsr x6, x6, x3
-; CHECK-NEXT:    lsr x3, x7, x3
-; CHECK-NEXT:    adcs x15, x12, xzr
-; CHECK-NEXT:    lsl x12, x14, x13
-; CHECK-NEXT:    cset w7, hs
-; CHECK-NEXT:    orr x16, x17, x6
-; CHECK-NEXT:    orr x14, x2, x4
-; CHECK-NEXT:    orr x13, x19, x3
-; CHECK-NEXT:    mov x17, xzr
-; CHECK-NEXT:    tbnz w7, #0, .LBB5_5
-; CHECK-NEXT:  // %bb.3: // %udiv-preheader
-; CHECK-NEXT:    lsr x6, x9, #3
-; CHECK-NEXT:    stp x18, x0, [sp]
-; CHECK-NEXT:    mov x0, sp
-; CHECK-NEXT:    stp q0, q0, [sp, #32]
-; CHECK-NEXT:    mvn w7, w9
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    and x18, x6, #0x18
-; CHECK-NEXT:    stp x1, x5, [sp, #16]
-; CHECK-NEXT:    and x5, x9, #0x3f
-; CHECK-NEXT:    add x0, x0, x18
-; CHECK-NEXT:    mov w18, #7 // =0x7
-; CHECK-NEXT:    eor x5, x5, #0x3f
-; CHECK-NEXT:    ldp x1, x6, [x0, #16]
-; CHECK-NEXT:    mov x4, xzr
-; CHECK-NEXT:    ldp x23, x21, [x0]
-; CHECK-NEXT:    subs x0, x18, #1
-; CHECK-NEXT:    mov x2, xzr
-; CHECK-NEXT:    lsl x20, x1, #1
-; CHECK-NEXT:    lsl x19, x6, #1
-; CHECK-NEXT:    lsr x22, x1, x9
-; CHECK-NEXT:    mov x1, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsr x23, x23, x9
-; CHECK-NEXT:    lsl x7, x20, x7
-; CHECK-NEXT:    lsl x20, x21, #1
-; CHECK-NEXT:    lsl x19, x19, x5
-; CHECK-NEXT:    lsr x21, x21, x9
-; CHECK-NEXT:    lsl x24, x20, x5
-; CHECK-NEXT:    adcs x5, xzr, x1
-; CHECK-NEXT:    lsr x20, x6, x9
-; CHECK-NEXT:    adcs x6, xzr, x1
-; CHECK-NEXT:    orr x19, x19, x22
-; CHECK-NEXT:    orr x21, x21, x7
-; CHECK-NEXT:    orr x22, x24, x23
-; CHECK-NEXT:    adc x7, xzr, x1
-; CHECK-NEXT:  .LBB5_4: // %udiv-do-while
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    extr x23, x22, x16, #63
-; CHECK-NEXT:    extr x24, x21, x22, #63
-; CHECK-NEXT:    extr x25, x19, x21, #63
-; CHECK-NEXT:    extr x20, x20, x19, #63
-; CHECK-NEXT:    extr x16, x16, x14, #63
-; CHECK-NEXT:    extr x14, x14, x13, #63
-; CHECK-NEXT:    cmp x0, x23
-; CHECK-NEXT:    sbcs xzr, x5, x24
-; CHECK-NEXT:    orr x14, x4, x14
-; CHECK-NEXT:    orr x16, x2, x16
-; CHECK-NEXT:    sbcs xzr, x6, x25
-; CHECK-NEXT:    mov x2, xzr
-; CHECK-NEXT:    sbc x19, x7, x20
-; CHECK-NEXT:    asr x26, x19, #63
-; CHECK-NEXT:    and x19, x26, x18
-; CHECK-NEXT:    subs x22, x23, x19
-; CHECK-NEXT:    extr x23, x13, x12, #63
-; CHECK-NEXT:    orr x12, x17, x12, lsl #1
-; CHECK-NEXT:    sbcs x21, x24, xzr
-; CHECK-NEXT:    and x17, x26, #0x1
-; CHECK-NEXT:    sbcs x19, x25, xzr
-; CHECK-NEXT:    orr x13, x3, x23
-; CHECK-NEXT:    sbc x20, x20, xzr
-; CHECK-NEXT:    subs x9, x9, #1
-; CHECK-NEXT:    adcs x10, x10, x1
-; CHECK-NEXT:    adcs x11, x11, x1
-; CHECK-NEXT:    adc x15, x15, x1
-; CHECK-NEXT:    orr x4, x9, x11
-; CHECK-NEXT:    orr x3, x10, x15
-; CHECK-NEXT:    orr x23, x4, x3
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    mov x4, xzr
-; CHECK-NEXT:    cbnz x23, .LBB5_4
-; CHECK-NEXT:  .LBB5_5: // %udiv-loop-exit
-; CHECK-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x2, x13, x12, #63
-; CHECK-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x15, x14, x13, #63
-; CHECK-NEXT:    ldp x24, x23, [sp, #144] // 16-byte Folded Reload
-; CHECK-NEXT:    extr x13, x16, x14, #63
-; CHECK-NEXT:    ldp x26, x25, [sp, #128] // 16-byte Folded Reload
-; CHECK-NEXT:    orr x14, x17, x12, lsl #1
-; CHECK-NEXT:    add sp, sp, #192
-; CHECK-NEXT:  .LBB5_6: // %udiv-end
-; CHECK-NEXT:    eor x9, x14, x8
-; CHECK-NEXT:    eor x10, x2, x8
-; CHECK-NEXT:    subs x0, x9, x8
-; CHECK-NEXT:    eor x9, x15, x8
-; CHECK-NEXT:    sbcs x1, x10, x8
-; CHECK-NEXT:    eor x10, x13, x8
-; CHECK-NEXT:    sbcs x2, x9, x8
-; CHECK-NEXT:    sbc x3, x10, x8
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov w4, #7 // =0x7
+; CHECK-NEXT:    mov x5, xzr
+; CHECK-NEXT:    mov x6, xzr
+; CHECK-NEXT:    mov x7, xzr
+; CHECK-NEXT:    bl __divoi3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = sdiv i256 %a, 7
   ret i256 %r
diff --git a/llvm/test/CodeGen/AArch64/shift-i256.ll b/llvm/test/CodeGen/AArch64/shift-i256.ll
index cde8144643575..ade849bc8b2fd 100644
--- a/llvm/test/CodeGen/AArch64/shift-i256.ll
+++ b/llvm/test/CodeGen/AArch64/shift-i256.ll
@@ -5,7 +5,10 @@
 define i256 @shl_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-LABEL: shl_i256:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #80
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    lsr x8, x4, #3
 ; CHECK-NEXT:    mov x9, sp
@@ -33,7 +36,8 @@ define i256 @shl_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-NEXT:    orr x1, x9, x15
 ; CHECK-NEXT:    orr x2, x12, x13
 ; CHECK-NEXT:    orr x3, x8, x11
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = shl i256 %a, %amt
   ret i256 %r
@@ -43,7 +47,10 @@ define i256 @shl_i256(i256 %a, i256 %amt) nounwind {
 define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-LABEL: lshr_i256:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #80
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    lsr x8, x4, #3
 ; CHECK-NEXT:    mov x9, sp
@@ -53,7 +60,7 @@ define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-NEXT:    and x8, x8, #0x18
 ; CHECK-NEXT:    stp x0, x1, [sp]
 ; CHECK-NEXT:    eor x11, x11, #0x3f
-; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    orr x8, x9, x8
 ; CHECK-NEXT:    stp q0, q0, [sp, #32]
 ; CHECK-NEXT:    ldp x9, x10, [x8]
 ; CHECK-NEXT:    ldp x12, x8, [x8, #16]
@@ -70,7 +77,8 @@ define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-NEXT:    orr x0, x14, x9
 ; CHECK-NEXT:    orr x1, x10, x8
 ; CHECK-NEXT:    orr x2, x11, x12
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = lshr i256 %a, %amt
   ret i256 %r
@@ -80,7 +88,10 @@ define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
 define i256 @ashr_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-LABEL: ashr_i256:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #80
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK-NEXT:    lsr x9, x4, #3
 ; CHECK-NEXT:    asr x8, x3, #63
 ; CHECK-NEXT:    mov x10, sp
@@ -91,7 +102,7 @@ define i256 @ashr_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-NEXT:    stp x0, x1, [sp]
 ; CHECK-NEXT:    eor x13, x13, #0x3f
 ; CHECK-NEXT:    stp x8, x8, [sp, #48]
-; CHECK-NEXT:    add x9, x10, x9
+; CHECK-NEXT:    orr x9, x10, x9
 ; CHECK-NEXT:    stp x8, x8, [sp, #32]
 ; CHECK-NEXT:    ldp x10, x8, [x9, #8]
 ; CHECK-NEXT:    ldr x11, [x9]
@@ -109,7 +120,8 @@ define i256 @ashr_i256(i256 %a, i256 %amt) nounwind {
 ; CHECK-NEXT:    orr x0, x15, x11
 ; CHECK-NEXT:    orr x1, x10, x12
 ; CHECK-NEXT:    orr x2, x13, x8
-; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = ashr i256 %a, %amt
   ret i256 %r
@@ -155,3 +167,4 @@ define i256 @ashr_i256_const(i256 %a) nounwind {
   %r = ashr i256 %a, 17
   ret i256 %r
 }
+
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index b02788ab1b34c..f1555c816b36d 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -179,7 +179,10 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
 ; ALL-NEXT:    ldr x10, [x1]
@@ -188,7 +191,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    mov x8, sp
 ; ALL-NEXT:    and x9, x10, #0x18
 ; ALL-NEXT:    str q1, [sp]
-; ALL-NEXT:    add x8, x8, x9
+; ALL-NEXT:    orr x8, x8, x9
 ; ALL-NEXT:    lsl x9, x10, #3
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
 ; ALL-NEXT:    ldp x11, x10, [x8, #16]
@@ -196,21 +199,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldp x8, x12, [x8]
 ; ALL-NEXT:    and x9, x9, #0x38
 ; ALL-NEXT:    lsl x14, x10, #1
-; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsl x16, x11, #1
 ; ALL-NEXT:    lsr x11, x11, x9
-; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    lsl x15, x12, #1
 ; ALL-NEXT:    lsr x10, x10, x9
-; ALL-NEXT:    lsr x12, x12, x9
-; ALL-NEXT:    lsl x14, x14, x13
 ; ALL-NEXT:    lsr x8, x8, x9
-; ALL-NEXT:    lsl x9, x16, x13
+; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x9, x12, x9
+; ALL-NEXT:    lsl x12, x16, x13
 ; ALL-NEXT:    lsl x13, x15, x13
 ; ALL-NEXT:    orr x11, x14, x11
-; ALL-NEXT:    orr x8, x9, x8
-; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    orr x9, x9, x12
+; ALL-NEXT:    orr x8, x13, x8
 ; ALL-NEXT:    stp x11, x10, [x2, #16]
 ; ALL-NEXT:    stp x8, x9, [x2]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -223,22 +227,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes_dwordOff:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
-; ALL-NEXT:    ubfiz x8, x10, #3, #2
-; ALL-NEXT:    mov x9, sp
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    bfi x8, x10, #3, #2
 ; ALL-NEXT:    str q1, [sp]
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
-; ALL-NEXT:    add x8, x9, x8
-; ALL-NEXT:    ldp x10, x9, [x8, #16]
+; ALL-NEXT:    ldp x9, x10, [x8, #16]
 ; ALL-NEXT:    ldr q0, [x8]
 ; ALL-NEXT:    str q0, [x2]
-; ALL-NEXT:    stp x10, x9, [x2, #16]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    stp x9, x10, [x2, #16]
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -251,7 +258,10 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
 ; ALL-NEXT:    ldr x10, [x1]
@@ -283,7 +293,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    orr x9, x12, x13
 ; ALL-NEXT:    stp x10, x11, [x2]
 ; ALL-NEXT:    stp x9, x8, [x2, #16]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -296,7 +307,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes_dwordOff:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
 ; ALL-NEXT:    ldr x10, [x1]
@@ -312,7 +326,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; ALL-NEXT:    ldr q0, [x8]
 ; ALL-NEXT:    str q0, [x2]
 ; ALL-NEXT:    stp x9, x10, [x2, #16]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -325,7 +340,10 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q0, [x0]
@@ -334,7 +352,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    asr x8, x8, #63
 ; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    add x9, x9, x11
+; ALL-NEXT:    orr x9, x9, x11
 ; ALL-NEXT:    stp x8, x8, [sp, #48]
 ; ALL-NEXT:    stp x8, x8, [sp, #32]
 ; ALL-NEXT:    lsl x8, x10, #3
@@ -343,21 +361,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    mvn w13, w8
 ; ALL-NEXT:    and x8, x8, #0x38
 ; ALL-NEXT:    lsl x14, x10, #1
-; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsl x16, x11, #1
 ; ALL-NEXT:    lsr x11, x11, x8
-; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    lsl x15, x12, #1
 ; ALL-NEXT:    asr x10, x10, x8
-; ALL-NEXT:    lsr x12, x12, x8
+; ALL-NEXT:    lsr x9, x9, x8
 ; ALL-NEXT:    lsl x14, x14, x13
-; ALL-NEXT:    lsr x8, x9, x8
-; ALL-NEXT:    lsl x9, x16, x13
+; ALL-NEXT:    lsr x8, x12, x8
+; ALL-NEXT:    lsl x12, x16, x13
 ; ALL-NEXT:    lsl x13, x15, x13
 ; ALL-NEXT:    orr x11, x14, x11
-; ALL-NEXT:    orr x8, x9, x8
-; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    orr x8, x8, x12
+; ALL-NEXT:    orr x9, x13, x9
 ; ALL-NEXT:    stp x11, x10, [x2, #16]
-; ALL-NEXT:    stp x8, x9, [x2]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -370,23 +389,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes_dwordOff:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q0, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
 ; ALL-NEXT:    asr x8, x8, #63
-; ALL-NEXT:    ubfiz x9, x10, #3, #2
-; ALL-NEXT:    mov x10, sp
+; ALL-NEXT:    mov x9, sp
+; ALL-NEXT:    bfi x9, x10, #3, #2
 ; ALL-NEXT:    str q0, [sp]
 ; ALL-NEXT:    stp x8, x8, [sp, #48]
 ; ALL-NEXT:    stp x8, x8, [sp, #32]
-; ALL-NEXT:    add x8, x10, x9
-; ALL-NEXT:    ldp x10, x9, [x8, #16]
-; ALL-NEXT:    ldr q0, [x8]
+; ALL-NEXT:    ldp x8, x10, [x9, #16]
+; ALL-NEXT:    ldr q0, [x9]
 ; ALL-NEXT:    str q0, [x2]
-; ALL-NEXT:    stp x10, x9, [x2, #16]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    stp x8, x10, [x2, #16]
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index 92fd4fe30980c..609cd8909bf5f 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -154,7 +154,10 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
 ; ALL-NEXT:    ldr x10, [x1]
@@ -163,30 +166,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    lsr x8, x10, #3
 ; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    str q1, [sp]
-; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x13, x10, #0x3f
+; ALL-NEXT:    mvn w14, w10
 ; ALL-NEXT:    and x8, x8, #0x18
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
-; ALL-NEXT:    eor x12, x12, #0x3f
-; ALL-NEXT:    add x8, x9, x8
-; ALL-NEXT:    ldp x13, x9, [x8]
-; ALL-NEXT:    ldp x8, x11, [x8, #16]
+; ALL-NEXT:    eor x13, x13, #0x3f
+; ALL-NEXT:    orr x8, x9, x8
+; ALL-NEXT:    ldp x11, x9, [x8, #16]
+; ALL-NEXT:    ldp x12, x8, [x8]
 ; ALL-NEXT:    lsl x15, x9, #1
-; ALL-NEXT:    lsr x9, x9, x10
-; ALL-NEXT:    lsr x13, x13, x10
-; ALL-NEXT:    lsl x14, x11, #1
+; ALL-NEXT:    lsl x17, x11, #1
 ; ALL-NEXT:    lsr x11, x11, x10
-; ALL-NEXT:    lsl x14, x14, x12
-; ALL-NEXT:    lsl x12, x15, x12
-; ALL-NEXT:    lsl x15, x8, #1
+; ALL-NEXT:    lsl x16, x8, #1
+; ALL-NEXT:    lsr x9, x9, x10
+; ALL-NEXT:    lsr x12, x12, x10
+; ALL-NEXT:    lsl x15, x15, x13
 ; ALL-NEXT:    lsr x8, x8, x10
-; ALL-NEXT:    mvn w10, w10
-; ALL-NEXT:    lsl x10, x15, x10
-; ALL-NEXT:    orr x8, x14, x8
-; ALL-NEXT:    stp x8, x11, [x2, #16]
-; ALL-NEXT:    orr x11, x12, x13
-; ALL-NEXT:    orr x8, x9, x10
-; ALL-NEXT:    stp x11, x8, [x2]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    lsl x10, x17, x14
+; ALL-NEXT:    lsl x13, x16, x13
+; ALL-NEXT:    orr x11, x15, x11
+; ALL-NEXT:    orr x8, x8, x10
+; ALL-NEXT:    stp x11, x9, [x2, #16]
+; ALL-NEXT:    orr x9, x13, x12
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -197,7 +201,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
 ; ALL-NEXT:    ldr x10, [x1]
@@ -230,7 +237,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    orr x11, x13, x14
 ; ALL-NEXT:    orr x8, x9, x10
 ; ALL-NEXT:    stp x8, x11, [x2, #16]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -241,7 +249,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT:    sub x9, sp, #80
+; ALL-NEXT:    mov x29, sp
+; ALL-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    mov x11, sp
 ; ALL-NEXT:    ldr x10, [x1]
@@ -250,31 +261,32 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    lsr x9, x10, #3
 ; ALL-NEXT:    asr x8, x8, #63
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x13, x10, #0x3f
+; ALL-NEXT:    mvn w14, w10
 ; ALL-NEXT:    and x9, x9, #0x18
 ; ALL-NEXT:    stp x8, x8, [sp, #48]
-; ALL-NEXT:    eor x12, x12, #0x3f
+; ALL-NEXT:    eor x13, x13, #0x3f
 ; ALL-NEXT:    stp x8, x8, [sp, #32]
-; ALL-NEXT:    add x8, x11, x9
-; ALL-NEXT:    ldp x13, x9, [x8]
-; ALL-NEXT:    ldp x8, x11, [x8, #16]
+; ALL-NEXT:    orr x8, x11, x9
+; ALL-NEXT:    ldp x11, x9, [x8, #16]
+; ALL-NEXT:    ldp x12, x8, [x8]
 ; ALL-NEXT:    lsl x15, x9, #1
-; ALL-NEXT:    lsr x9, x9, x10
-; ALL-NEXT:    lsr x13, x13, x10
-; ALL-NEXT:    lsl x14, x11, #1
-; ALL-NEXT:    asr x11, x11, x10
-; ALL-NEXT:    lsl x14, x14, x12
-; ALL-NEXT:    lsl x12, x15, x12
-; ALL-NEXT:    lsl x15, x8, #1
+; ALL-NEXT:    lsl x17, x11, #1
+; ALL-NEXT:    lsr x11, x11, x10
+; ALL-NEXT:    lsl x16, x8, #1
+; ALL-NEXT:    asr x9, x9, x10
+; ALL-NEXT:    lsr x12, x12, x10
+; ALL-NEXT:    lsl x15, x15, x13
 ; ALL-NEXT:    lsr x8, x8, x10
-; ALL-NEXT:    mvn w10, w10
-; ALL-NEXT:    lsl x10, x15, x10
-; ALL-NEXT:    orr x8, x14, x8
-; ALL-NEXT:    stp x8, x11, [x2, #16]
-; ALL-NEXT:    orr x11, x12, x13
-; ALL-NEXT:    orr x8, x9, x10
-; ALL-NEXT:    stp x11, x8, [x2]
-; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    lsl x10, x17, x14
+; ALL-NEXT:    lsl x13, x16, x13
+; ALL-NEXT:    orr x11, x15, x11
+; ALL-NEXT:    orr x8, x8, x10
+; ALL-NEXT:    stp x11, x9, [x2, #16]
+; ALL-NEXT:    orr x9, x13, x12
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    mov sp, x29
+; ALL-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/i256-arith.ll b/llvm/test/CodeGen/RISCV/i256-arith.ll
new file mode 100644
index 0000000000000..45da20f332b84
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/i256-arith.ll
@@ -0,0 +1,1442 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 | FileCheck %s --check-prefix=RV32
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefix=RV64
+
+; i256 add
+define i256 @add_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: add_i256:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a3, 0(a2)
+; RV32-NEXT:    lw a4, 4(a2)
+; RV32-NEXT:    lw a6, 8(a2)
+; RV32-NEXT:    lw a5, 12(a2)
+; RV32-NEXT:    lw t2, 4(a1)
+; RV32-NEXT:    lw a7, 0(a1)
+; RV32-NEXT:    lw t1, 8(a1)
+; RV32-NEXT:    lw t0, 12(a1)
+; RV32-NEXT:    add a4, t2, a4
+; RV32-NEXT:    add a3, a7, a3
+; RV32-NEXT:    sltu a7, a3, a7
+; RV32-NEXT:    add a4, a4, a7
+; RV32-NEXT:    beq a4, t2, .LBB0_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a7, a4, t2
+; RV32-NEXT:  .LBB0_2:
+; RV32-NEXT:    add a6, t1, a6
+; RV32-NEXT:    add t2, t0, a5
+; RV32-NEXT:    add a5, a6, a7
+; RV32-NEXT:    sltu t3, a6, t1
+; RV32-NEXT:    sltu a6, a5, a6
+; RV32-NEXT:    add t2, t2, t3
+; RV32-NEXT:    add a6, t2, a6
+; RV32-NEXT:    beq a6, t0, .LBB0_4
+; RV32-NEXT:  # %bb.3:
+; RV32-NEXT:    sltu t3, a6, t0
+; RV32-NEXT:    j .LBB0_5
+; RV32-NEXT:  .LBB0_4:
+; RV32-NEXT:    sltu t3, a5, t1
+; RV32-NEXT:  .LBB0_5:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw s0, 16(a2)
+; RV32-NEXT:    lw t6, 20(a2)
+; RV32-NEXT:    lw t5, 16(a1)
+; RV32-NEXT:    lw t2, 20(a1)
+; RV32-NEXT:    xor t1, a5, t1
+; RV32-NEXT:    xor t0, a6, t0
+; RV32-NEXT:    or t0, t1, t0
+; RV32-NEXT:    beqz t0, .LBB0_7
+; RV32-NEXT:  # %bb.6:
+; RV32-NEXT:    mv a7, t3
+; RV32-NEXT:  .LBB0_7:
+; RV32-NEXT:    lw t4, 24(a2)
+; RV32-NEXT:    lw t0, 28(a2)
+; RV32-NEXT:    lw a2, 24(a1)
+; RV32-NEXT:    lw t3, 28(a1)
+; RV32-NEXT:    add s0, t5, s0
+; RV32-NEXT:    add t6, t2, t6
+; RV32-NEXT:    add a7, s0, a7
+; RV32-NEXT:    sltu t1, s0, t5
+; RV32-NEXT:    sltu t5, a7, s0
+; RV32-NEXT:    add t6, t6, t1
+; RV32-NEXT:    add a1, t6, t5
+; RV32-NEXT:    sltu s0, a1, t6
+; RV32-NEXT:    and t5, t5, s0
+; RV32-NEXT:    beq t6, t2, .LBB0_9
+; RV32-NEXT:  # %bb.8:
+; RV32-NEXT:    sltu t1, t6, t2
+; RV32-NEXT:  .LBB0_9:
+; RV32-NEXT:    add t4, a2, t4
+; RV32-NEXT:    add t0, t3, t0
+; RV32-NEXT:    sw a3, 0(a0)
+; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a5, 8(a0)
+; RV32-NEXT:    sw a6, 12(a0)
+; RV32-NEXT:    add t1, t4, t1
+; RV32-NEXT:    sltu a2, t4, a2
+; RV32-NEXT:    add t5, t1, t5
+; RV32-NEXT:    sltu a3, t1, t4
+; RV32-NEXT:    add a2, t0, a2
+; RV32-NEXT:    sltu a4, t5, t1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    sw a7, 16(a0)
+; RV32-NEXT:    sw a1, 20(a0)
+; RV32-NEXT:    sw t5, 24(a0)
+; RV32-NEXT:    sw a2, 28(a0)
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_i256:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a5, 0(a2)
+; RV64-NEXT:    ld t0, 8(a2)
+; RV64-NEXT:    ld a4, 16(a2)
+; RV64-NEXT:    ld a2, 24(a2)
+; RV64-NEXT:    ld a7, 8(a1)
+; RV64-NEXT:    ld t1, 0(a1)
+; RV64-NEXT:    ld a3, 16(a1)
+; RV64-NEXT:    ld a6, 24(a1)
+; RV64-NEXT:    add t2, a7, t0
+; RV64-NEXT:    add a1, t1, a5
+; RV64-NEXT:    sltu t0, a1, t1
+; RV64-NEXT:    add a5, t2, t0
+; RV64-NEXT:    beq a5, a7, .LBB0_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    sltu t0, a5, a7
+; RV64-NEXT:  .LBB0_2:
+; RV64-NEXT:    add a4, a3, a4
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    add t0, a4, t0
+; RV64-NEXT:    sltu a3, a4, a3
+; RV64-NEXT:    sltu a4, t0, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    sd a1, 0(a0)
+; RV64-NEXT:    sd a5, 8(a0)
+; RV64-NEXT:    sd t0, 16(a0)
+; RV64-NEXT:    sd a2, 24(a0)
+; RV64-NEXT:    ret
+  %r = add i256 %a, %b
+  ret i256 %r
+}
+
+; i256 sub
+define i256 @sub_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: sub_i256:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 0(a2)
+; RV32-NEXT:    lw a4, 4(a2)
+; RV32-NEXT:    lw a7, 8(a2)
+; RV32-NEXT:    lw t3, 12(a2)
+; RV32-NEXT:    lw a5, 0(a1)
+; RV32-NEXT:    lw t2, 8(a1)
+; RV32-NEXT:    lw t5, 12(a1)
+; RV32-NEXT:    lw t0, 4(a1)
+; RV32-NEXT:    sltu a6, t2, a7
+; RV32-NEXT:    mv s2, a6
+; RV32-NEXT:    beq t5, t3, .LBB1_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu s2, t5, t3
+; RV32-NEXT:  .LBB1_2:
+; RV32-NEXT:    sltu t1, a5, a3
+; RV32-NEXT:    mv t4, t1
+; RV32-NEXT:    beq t0, a4, .LBB1_4
+; RV32-NEXT:  # %bb.3:
+; RV32-NEXT:    sltu t4, t0, a4
+; RV32-NEXT:  .LBB1_4:
+; RV32-NEXT:    lw s7, 16(a2)
+; RV32-NEXT:    lw s0, 20(a2)
+; RV32-NEXT:    lw s8, 16(a1)
+; RV32-NEXT:    lw s1, 20(a1)
+; RV32-NEXT:    xor t6, t5, t3
+; RV32-NEXT:    xor s3, t2, a7
+; RV32-NEXT:    or s3, s3, t6
+; RV32-NEXT:    mv t6, t4
+; RV32-NEXT:    beqz s3, .LBB1_6
+; RV32-NEXT:  # %bb.5:
+; RV32-NEXT:    mv t6, s2
+; RV32-NEXT:  .LBB1_6:
+; RV32-NEXT:    lw s4, 24(a2)
+; RV32-NEXT:    lw s3, 28(a2)
+; RV32-NEXT:    lw s6, 24(a1)
+; RV32-NEXT:    lw s5, 28(a1)
+; RV32-NEXT:    sub s2, s8, s7
+; RV32-NEXT:    sltu a2, s8, s7
+; RV32-NEXT:    sub a1, s1, s0
+; RV32-NEXT:    sltu s7, s2, t6
+; RV32-NEXT:    sub s8, a1, a2
+; RV32-NEXT:    snez a1, s8
+; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a1, a1, s7
+; RV32-NEXT:    beq s1, s0, .LBB1_8
+; RV32-NEXT:  # %bb.7:
+; RV32-NEXT:    sltu a2, s1, s0
+; RV32-NEXT:  .LBB1_8:
+; RV32-NEXT:    sub s0, s6, s4
+; RV32-NEXT:    sltu s1, s6, s4
+; RV32-NEXT:    sub s3, s5, s3
+; RV32-NEXT:    sub s4, s8, s7
+; RV32-NEXT:    sub t6, s2, t6
+; RV32-NEXT:    sub t3, t5, t3
+; RV32-NEXT:    sub a7, t2, a7
+; RV32-NEXT:    sub a4, t0, a4
+; RV32-NEXT:    sub a5, a5, a3
+; RV32-NEXT:    sub a3, s0, a2
+; RV32-NEXT:    sub t0, s3, s1
+; RV32-NEXT:    sltu a2, s0, a2
+; RV32-NEXT:    sub a6, t3, a6
+; RV32-NEXT:    sltu t2, a7, t4
+; RV32-NEXT:    sub a7, a7, t4
+; RV32-NEXT:    sub a4, a4, t1
+; RV32-NEXT:    sltu t1, a3, a1
+; RV32-NEXT:    sub a2, t0, a2
+; RV32-NEXT:    sub a3, a3, a1
+; RV32-NEXT:    sub a1, a6, t2
+; RV32-NEXT:    sub a2, a2, t1
+; RV32-NEXT:    sw a5, 0(a0)
+; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a7, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    sw t6, 16(a0)
+; RV32-NEXT:    sw s4, 20(a0)
+; RV32-NEXT:    sw a3, 24(a0)
+; RV32-NEXT:    sw a2, 28(a0)
+; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: sub_i256:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a3, 0(a2)
+; RV64-NEXT:    ld a4, 8(a2)
+; RV64-NEXT:    ld a5, 16(a2)
+; RV64-NEXT:    ld a6, 24(a2)
+; RV64-NEXT:    ld t0, 16(a1)
+; RV64-NEXT:    ld a7, 24(a1)
+; RV64-NEXT:    ld a2, 0(a1)
+; RV64-NEXT:    ld a1, 8(a1)
+; RV64-NEXT:    sltu t1, t0, a5
+; RV64-NEXT:    sub a6, a7, a6
+; RV64-NEXT:    sltu a7, a2, a3
+; RV64-NEXT:    sub a6, a6, t1
+; RV64-NEXT:    mv t1, a7
+; RV64-NEXT:    beq a1, a4, .LBB1_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    sltu t1, a1, a4
+; RV64-NEXT:  .LBB1_2:
+; RV64-NEXT:    sub a5, t0, a5
+; RV64-NEXT:    sub a1, a1, a4
+; RV64-NEXT:    sub a2, a2, a3
+; RV64-NEXT:    sltu a3, a5, t1
+; RV64-NEXT:    sub a4, a5, t1
+; RV64-NEXT:    sub a1, a1, a7
+; RV64-NEXT:    sub a3, a6, a3
+; RV64-NEXT:    sd a2, 0(a0)
+; RV64-NEXT:    sd a1, 8(a0)
+; RV64-NEXT:    sd a4, 16(a0)
+; RV64-NEXT:    sd a3, 24(a0)
+; RV64-NEXT:    ret
+  %r = sub i256 %a, %b
+  ret i256 %r
+}
+
+; i256 shift left by constant
+define i256 @shl_i256_const(i256 %a) nounwind {
+; RV32-LABEL: shl_i256_const:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a4, 8(a1)
+; RV32-NEXT:    lw a5, 12(a1)
+; RV32-NEXT:    lw a6, 16(a1)
+; RV32-NEXT:    lw a1, 20(a1)
+; RV32-NEXT:    sw a4, 16(a0)
+; RV32-NEXT:    sw a5, 20(a0)
+; RV32-NEXT:    sw a6, 24(a0)
+; RV32-NEXT:    sw a1, 28(a0)
+; RV32-NEXT:    sw zero, 0(a0)
+; RV32-NEXT:    sw zero, 4(a0)
+; RV32-NEXT:    sw a2, 8(a0)
+; RV32-NEXT:    sw a3, 12(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: shl_i256_const:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 0(a1)
+; RV64-NEXT:    ld a3, 8(a1)
+; RV64-NEXT:    ld a1, 16(a1)
+; RV64-NEXT:    sd zero, 0(a0)
+; RV64-NEXT:    sd a2, 8(a0)
+; RV64-NEXT:    sd a3, 16(a0)
+; RV64-NEXT:    sd a1, 24(a0)
+; RV64-NEXT:    ret
+  %r = shl i256 %a, 64
+  ret i256 %r
+}
+
+; i256 shift left by variable
+define i256 @shl_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: shl_i256:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a2, 0(a2)
+; RV32-NEXT:    lw a3, 0(a1)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a5, 8(a1)
+; RV32-NEXT:    lw a6, 12(a1)
+; RV32-NEXT:    lw a7, 16(a1)
+; RV32-NEXT:    lw t0, 20(a1)
+; RV32-NEXT:    lw t1, 24(a1)
+; RV32-NEXT:    lw a1, 28(a1)
+; RV32-NEXT:    sw zero, 24(sp)
+; RV32-NEXT:    sw zero, 28(sp)
+; RV32-NEXT:    sw zero, 32(sp)
+; RV32-NEXT:    sw zero, 36(sp)
+; RV32-NEXT:    sw zero, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    sw zero, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    addi t2, sp, 40
+; RV32-NEXT:    sw a7, 56(sp)
+; RV32-NEXT:    sw t0, 60(sp)
+; RV32-NEXT:    sw t1, 64(sp)
+; RV32-NEXT:    sw a1, 68(sp)
+; RV32-NEXT:    sw a3, 40(sp)
+; RV32-NEXT:    sw a4, 44(sp)
+; RV32-NEXT:    sw a5, 48(sp)
+; RV32-NEXT:    sw a6, 52(sp)
+; RV32-NEXT:    srli a1, a2, 3
+; RV32-NEXT:    andi a3, a2, 31
+; RV32-NEXT:    andi a1, a1, 28
+; RV32-NEXT:    xori a3, a3, 31
+; RV32-NEXT:    sub a1, t2, a1
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a5, 4(a1)
+; RV32-NEXT:    lw a6, 8(a1)
+; RV32-NEXT:    lw a7, 12(a1)
+; RV32-NEXT:    lw t0, 16(a1)
+; RV32-NEXT:    lw t1, 20(a1)
+; RV32-NEXT:    lw t2, 24(a1)
+; RV32-NEXT:    lw a1, 28(a1)
+; RV32-NEXT:    sll t3, a5, a2
+; RV32-NEXT:    srli t4, a4, 1
+; RV32-NEXT:    sll t5, a6, a2
+; RV32-NEXT:    srli a5, a5, 1
+; RV32-NEXT:    sll t6, a7, a2
+; RV32-NEXT:    srli a6, a6, 1
+; RV32-NEXT:    sll s0, t0, a2
+; RV32-NEXT:    srli a7, a7, 1
+; RV32-NEXT:    sll s1, t1, a2
+; RV32-NEXT:    srli t0, t0, 1
+; RV32-NEXT:    sll a1, a1, a2
+; RV32-NEXT:    sll a4, a4, a2
+; RV32-NEXT:    sll a2, t2, a2
+; RV32-NEXT:    srli t1, t1, 1
+; RV32-NEXT:    srli t2, t2, 1
+; RV32-NEXT:    srl t4, t4, a3
+; RV32-NEXT:    srl a5, a5, a3
+; RV32-NEXT:    srl a6, a6, a3
+; RV32-NEXT:    srl a7, a7, a3
+; RV32-NEXT:    srl t0, t0, a3
+; RV32-NEXT:    srl t1, t1, a3
+; RV32-NEXT:    srl a3, t2, a3
+; RV32-NEXT:    or t2, t3, t4
+; RV32-NEXT:    or a5, t5, a5
+; RV32-NEXT:    or a6, t6, a6
+; RV32-NEXT:    or a7, s0, a7
+; RV32-NEXT:    or t0, s1, t0
+; RV32-NEXT:    or a2, a2, t1
+; RV32-NEXT:    or a1, a1, a3
+; RV32-NEXT:    sw a7, 16(a0)
+; RV32-NEXT:    sw t0, 20(a0)
+; RV32-NEXT:    sw a2, 24(a0)
+; RV32-NEXT:    sw a1, 28(a0)
+; RV32-NEXT:    sw a4, 0(a0)
+; RV32-NEXT:    sw t2, 4(a0)
+; RV32-NEXT:    sw a5, 8(a0)
+; RV32-NEXT:    sw a6, 12(a0)
+; RV32-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: shl_i256:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 96
+; RV64-NEXT:    andi sp, sp, -32
+; RV64-NEXT:    ld a2, 0(a2)
+; RV64-NEXT:    ld a3, 0(a1)
+; RV64-NEXT:    ld a4, 8(a1)
+; RV64-NEXT:    ld a5, 16(a1)
+; RV64-NEXT:    ld a1, 24(a1)
+; RV64-NEXT:    sd zero, 0(sp)
+; RV64-NEXT:    sd zero, 8(sp)
+; RV64-NEXT:    sd zero, 16(sp)
+; RV64-NEXT:    sd zero, 24(sp)
+; RV64-NEXT:    addi a6, sp, 32
+; RV64-NEXT:    sd a3, 32(sp)
+; RV64-NEXT:    sd a4, 40(sp)
+; RV64-NEXT:    sd a5, 48(sp)
+; RV64-NEXT:    sd a1, 56(sp)
+; RV64-NEXT:    srli a1, a2, 3
+; RV64-NEXT:    andi a3, a2, 63
+; RV64-NEXT:    andi a1, a1, 24
+; RV64-NEXT:    sub a1, a6, a1
+; RV64-NEXT:    ld a4, 0(a1)
+; RV64-NEXT:    ld a5, 8(a1)
+; RV64-NEXT:    ld a6, 16(a1)
+; RV64-NEXT:    ld a1, 24(a1)
+; RV64-NEXT:    xori a3, a3, 63
+; RV64-NEXT:    sll a7, a5, a2
+; RV64-NEXT:    srli t0, a4, 1
+; RV64-NEXT:    sll a1, a1, a2
+; RV64-NEXT:    sll a4, a4, a2
+; RV64-NEXT:    sll a2, a6, a2
+; RV64-NEXT:    srli a5, a5, 1
+; RV64-NEXT:    srli a6, a6, 1
+; RV64-NEXT:    srl t0, t0, a3
+; RV64-NEXT:    srl a5, a5, a3
+; RV64-NEXT:    srl a3, a6, a3
+; RV64-NEXT:    or a6, a7, t0
+; RV64-NEXT:    or a2, a2, a5
+; RV64-NEXT:    or a1, a1, a3
+; RV64-NEXT:    sd a4, 0(a0)
+; RV64-NEXT:    sd a6, 8(a0)
+; RV64-NEXT:    sd a2, 16(a0)
+; RV64-NEXT:    sd a1, 24(a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
+; RV64-NEXT:    ret
+  %r = shl i256 %a, %b
+  ret i256 %r
+}
+
+; i256 multiply (no native support: lowered to __muldi3 libcalls on RV32, __multi3 on RV64)
+define i256 @mul_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: mul_i256:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -240
+; RV32-NEXT:    sw ra, 236(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 232(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 228(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 224(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 220(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 216(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 212(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 208(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 204(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 200(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 196(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 192(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 16(a2)
+; RV32-NEXT:    sw a3, 172(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 20(a2)
+; RV32-NEXT:    sw a3, 152(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 24(a2)
+; RV32-NEXT:    sw a3, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 28(a2)
+; RV32-NEXT:    sw a3, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 16(a1)
+; RV32-NEXT:    sw a3, 164(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 20(a1)
+; RV32-NEXT:    sw a3, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 24(a1)
+; RV32-NEXT:    sw a3, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 28(a1)
+; RV32-NEXT:    sw a3, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw s9, 0(a2)
+; RV32-NEXT:    lw s8, 4(a2)
+; RV32-NEXT:    lw a3, 8(a2)
+; RV32-NEXT:    sw a3, 156(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    sw a2, 176(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw s3, 0(a1)
+; RV32-NEXT:    lw s2, 4(a1)
+; RV32-NEXT:    lw s6, 8(a1)
+; RV32-NEXT:    lw s7, 12(a1)
+; RV32-NEXT:    sw a0, 136(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a0, s3
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s9
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a0, 132(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv s0, a1
+; RV32-NEXT:    mv a0, s2
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s9
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    add s0, a0, s0
+; RV32-NEXT:    sltu a0, s0, a0
+; RV32-NEXT:    add s1, a1, a0
+; RV32-NEXT:    sw s3, 180(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a0, s3
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s8
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    add s0, a0, s0
+; RV32-NEXT:    sw s0, 128(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sltu a0, s0, a0
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add s0, s1, a0
+; RV32-NEXT:    sw s2, 184(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a0, s2
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s8
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    add s4, a0, s0
+; RV32-NEXT:    sltu a2, s0, s1
+; RV32-NEXT:    sltu a0, s4, a0
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add s1, a1, a0
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s9
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s3, a0
+; RV32-NEXT:    mv s5, a1
+; RV32-NEXT:    mv a0, s7
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    sw s9, 160(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a2, s9
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s2, a0
+; RV32-NEXT:    mv s0, a1
+; RV32-NEXT:    add s5, a0, s5
+; RV32-NEXT:    sw s6, 144(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    sw s8, 168(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a2, s8
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    add a2, a0, s5
+; RV32-NEXT:    add s4, s3, s4
+; RV32-NEXT:    add s1, a2, s1
+; RV32-NEXT:    sltu s6, s4, s3
+; RV32-NEXT:    add s8, s1, s6
+; RV32-NEXT:    beq s8, a2, .LBB4_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu s6, s8, a2
+; RV32-NEXT:  .LBB4_2:
+; RV32-NEXT:    sltu a3, s5, s2
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    add a3, s0, a3
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    sw a3, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s0, a3, a0
+; RV32-NEXT:    sw s7, 148(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a0, s7
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a1, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 96(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add a0, a0, s0
+; RV32-NEXT:    sw a0, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s6, a0, s6
+; RV32-NEXT:    lw s0, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a0, s0
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw s1, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a2, s1
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s5, a0
+; RV32-NEXT:    mv s7, a1
+; RV32-NEXT:    lw a0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s1
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s3, a0
+; RV32-NEXT:    mv s2, a1
+; RV32-NEXT:    add s7, a0, s7
+; RV32-NEXT:    mv a0, s0
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    add a2, a0, s7
+; RV32-NEXT:    add s4, s5, s4
+; RV32-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sltu s0, s4, s5
+; RV32-NEXT:    add a3, a2, s0
+; RV32-NEXT:    add a3, a3, s8
+; RV32-NEXT:    beq a3, a2, .LBB4_4
+; RV32-NEXT:  # %bb.3:
+; RV32-NEXT:    sltu s0, a3, a2
+; RV32-NEXT:  .LBB4_4:
+; RV32-NEXT:    sw a3, 100(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sltu a3, s7, s3
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    add s11, s2, a3
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add s10, s11, a0
+; RV32-NEXT:    lw a0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s5, a0
+; RV32-NEXT:    sw a1, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s2, a0, s10
+; RV32-NEXT:    add s9, s2, s0
+; RV32-NEXT:    add s0, s6, s9
+; RV32-NEXT:    lw s8, 144(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a0, s8
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw s1, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a2, s1
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s3, a0
+; RV32-NEXT:    sw a1, 64(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s7, a0, s0
+; RV32-NEXT:    lw a0, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s4, a0
+; RV32-NEXT:    sw a1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 172(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a0, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a1, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add a6, a0, s4
+; RV32-NEXT:    sw a6, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sltu a7, s0, s6
+; RV32-NEXT:    lw a2, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a0, s6, a2
+; RV32-NEXT:    lw a1, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a1, a2, a1
+; RV32-NEXT:    lw a2, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a3, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a2, a3, a2
+; RV32-NEXT:    sltu a3, s9, s2
+; RV32-NEXT:    sltu a4, s2, s5
+; RV32-NEXT:    sltu a5, s10, s11
+; RV32-NEXT:    add a6, s7, a6
+; RV32-NEXT:    lw t0, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a2, t0, a2
+; RV32-NEXT:    lw t0, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a5, t0, a5
+; RV32-NEXT:    sw a6, 96(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sltu s9, a6, s7
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    add s4, a1, a0
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    add a3, s4, a3
+; RV32-NEXT:    sw a7, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s11, a3, a7
+; RV32-NEXT:    lw a0, 148(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s1
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s1, a0
+; RV32-NEXT:    sw a1, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw s10, 64(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s10, a0, s10
+; RV32-NEXT:    mv a0, s8
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s2, a0
+; RV32-NEXT:    sw a1, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s5, a0, s10
+; RV32-NEXT:    sltu s0, s7, s3
+; RV32-NEXT:    add s6, s5, s0
+; RV32-NEXT:    add s6, s6, s11
+; RV32-NEXT:    lw a0, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a1, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw s7, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s7, a0, s7
+; RV32-NEXT:    lw a0, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw s3, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a2, s3
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s7, a0, s7
+; RV32-NEXT:    lw a0, 152(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 64(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s8, a0, s8
+; RV32-NEXT:    lw a0, 172(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw a2, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw s8, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add a0, a0, s8
+; RV32-NEXT:    sw a0, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add a0, a0, s7
+; RV32-NEXT:    lw a2, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a3, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a3, a3, a2
+; RV32-NEXT:    add a4, a0, a3
+; RV32-NEXT:    sw s9, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add a0, a4, s9
+; RV32-NEXT:    add a2, s6, a0
+; RV32-NEXT:    sw a1, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    beq a2, s6, .LBB4_6
+; RV32-NEXT:  # %bb.5:
+; RV32-NEXT:    sltu a0, a2, s6
+; RV32-NEXT:    sw a0, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT:  .LBB4_6:
+; RV32-NEXT:    beq s6, s5, .LBB4_8
+; RV32-NEXT:  # %bb.7:
+; RV32-NEXT:    sltu s0, s6, s5
+; RV32-NEXT:  .LBB4_8:
+; RV32-NEXT:    sw a4, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a3, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a2, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw s6, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    beq s11, s4, .LBB4_10
+; RV32-NEXT:  # %bb.9:
+; RV32-NEXT:    sltu s6, s11, s4
+; RV32-NEXT:  .LBB4_10:
+; RV32-NEXT:    sltu a0, s10, s1
+; RV32-NEXT:    sltu a1, s5, s2
+; RV32-NEXT:    lw a2, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    lw a2, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s2, a0, a1
+; RV32-NEXT:    lw s5, 148(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a0, s5
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    lw s1, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a2, s1
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a1, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add a0, a0, s2
+; RV32-NEXT:    sw a0, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add a0, a0, s6
+; RV32-NEXT:    sw a0, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s0, a0, s0
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a1, s3
+; RV32-NEXT:    lw a2, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a3, s1
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    sw a1, 164(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a2, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a3, s4
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a1, 156(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 160(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s1, a0, s0
+; RV32-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a1, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu s6, a1, a0
+; RV32-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a0, s7, a0
+; RV32-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s6, a1, s6
+; RV32-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add s8, s6, a0
+; RV32-NEXT:    mv a0, s4
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s3
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    sw a1, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 176(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s0, a0, s8
+; RV32-NEXT:    sw s1, 168(sp) # 4-byte Folded Spill
+; RV32-NEXT:    add s1, s0, s1
+; RV32-NEXT:    lw a0, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a1, s9
+; RV32-NEXT:    lw a2, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a3, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s4, a0
+; RV32-NEXT:    sw a1, 180(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 144(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a1, s5
+; RV32-NEXT:    lw a2, 172(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 152(sp) # 4-byte Folded Reload
+; RV32-NEXT:    mv a3, s5
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    mv s10, a0
+; RV32-NEXT:    mv s11, a1
+; RV32-NEXT:    add s4, a0, s4
+; RV32-NEXT:    lw a0, 64(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a1, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu s7, a1, a0
+; RV32-NEXT:    lw s2, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a0, s2, a0
+; RV32-NEXT:    lw a1, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s7, a1, s7
+; RV32-NEXT:    lw a1, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add s3, s7, a0
+; RV32-NEXT:    mv a0, s5
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    mv a2, s9
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __muldi3
+; RV32-NEXT:    add a7, a0, s3
+; RV32-NEXT:    add a6, a7, s4
+; RV32-NEXT:    add a2, a6, s1
+; RV32-NEXT:    lw t3, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a4, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    beq a4, s2, .LBB4_12
+; RV32-NEXT:  # %bb.11:
+; RV32-NEXT:    sltu t3, a4, s2
+; RV32-NEXT:  .LBB4_12:
+; RV32-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a4, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a4, a4, a3
+; RV32-NEXT:    sltu a3, s8, s6
+; RV32-NEXT:    sltu t0, s3, s7
+; RV32-NEXT:    lw a5, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw t1, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a5, t1, a5
+; RV32-NEXT:    lw t1, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw t2, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu t1, t2, t1
+; RV32-NEXT:    lw t2, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a3, t2, a3
+; RV32-NEXT:    lw t2, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s11, s11, t2
+; RV32-NEXT:    sltu t2, s4, s10
+; RV32-NEXT:    add a1, a1, t0
+; RV32-NEXT:    add t0, a2, t3
+; RV32-NEXT:    lw t3, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw t4, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu t3, t4, t3
+; RV32-NEXT:    lw t5, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu t4, t5, t4
+; RV32-NEXT:    lw s3, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu t5, s3, t5
+; RV32-NEXT:    lw t6, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu t6, s0, t6
+; RV32-NEXT:    sltu s0, s1, s0
+; RV32-NEXT:    sltu a0, a7, a0
+; RV32-NEXT:    sltu a7, a6, a7
+; RV32-NEXT:    sltu a6, a2, a6
+; RV32-NEXT:    lw s2, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sw s1, 0(s2)
+; RV32-NEXT:    lw s1, 128(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sw s1, 4(s2)
+; RV32-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sw s1, 8(s2)
+; RV32-NEXT:    lw s1, 100(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sw s1, 12(s2)
+; RV32-NEXT:    add s1, s3, t0
+; RV32-NEXT:    lw s4, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a4, s4, a4
+; RV32-NEXT:    sltu a2, t0, a2
+; RV32-NEXT:    add a5, a5, t1
+; RV32-NEXT:    add t2, s11, t2
+; RV32-NEXT:    lw t0, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add t0, s1, t0
+; RV32-NEXT:    sltu t1, s1, s3
+; RV32-NEXT:    add a4, a4, t3
+; RV32-NEXT:    add a3, a3, t6
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add a2, a6, a2
+; RV32-NEXT:    sltu a1, t0, s1
+; RV32-NEXT:    add a4, a4, t4
+; RV32-NEXT:    add a3, a3, a5
+; RV32-NEXT:    add a0, a0, t2
+; RV32-NEXT:    add a4, a4, t5
+; RV32-NEXT:    add a3, a3, s0
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    add a0, a0, t1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    lw a1, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sw a1, 16(s2)
+; RV32-NEXT:    lw a1, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sw a1, 20(s2)
+; RV32-NEXT:    sw t0, 24(s2)
+; RV32-NEXT:    sw a0, 28(s2)
+; RV32-NEXT:    lw ra, 236(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 232(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 228(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 224(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 220(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 216(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 212(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 208(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 204(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 200(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 196(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 192(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 240
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mul_i256:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld s3, 0(a2)
+; RV64-NEXT:    ld s1, 8(a2)
+; RV64-NEXT:    ld s4, 16(a2)
+; RV64-NEXT:    ld s5, 24(a2)
+; RV64-NEXT:    ld s6, 0(a1)
+; RV64-NEXT:    ld s2, 8(a1)
+; RV64-NEXT:    ld a2, 16(a1)
+; RV64-NEXT:    ld a3, 24(a1)
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    mv a0, s3
+; RV64-NEXT:    mv a1, s1
+; RV64-NEXT:    call __multi3
+; RV64-NEXT:    mv s7, a0
+; RV64-NEXT:    mv s8, a1
+; RV64-NEXT:    mv a0, s4
+; RV64-NEXT:    mv a1, s5
+; RV64-NEXT:    mv a2, s6
+; RV64-NEXT:    mv a3, s2
+; RV64-NEXT:    call __multi3
+; RV64-NEXT:    add a1, a1, s8
+; RV64-NEXT:    add s7, a0, s7
+; RV64-NEXT:    sltu a0, s7, a0
+; RV64-NEXT:    add s8, a1, a0
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    li a1, 0
+; RV64-NEXT:    mv a2, s3
+; RV64-NEXT:    li a3, 0
+; RV64-NEXT:    call __multi3
+; RV64-NEXT:    mv s4, a0
+; RV64-NEXT:    mv s5, a1
+; RV64-NEXT:    mv a0, s2
+; RV64-NEXT:    li a1, 0
+; RV64-NEXT:    mv a2, s3
+; RV64-NEXT:    li a3, 0
+; RV64-NEXT:    call __multi3
+; RV64-NEXT:    add s5, a0, s5
+; RV64-NEXT:    sltu a0, s5, a0
+; RV64-NEXT:    add s3, a1, a0
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    li a1, 0
+; RV64-NEXT:    mv a2, s1
+; RV64-NEXT:    li a3, 0
+; RV64-NEXT:    call __multi3
+; RV64-NEXT:    add s5, a0, s5
+; RV64-NEXT:    sltu a0, s5, a0
+; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    add s6, s3, a0
+; RV64-NEXT:    mv a0, s2
+; RV64-NEXT:    li a1, 0
+; RV64-NEXT:    mv a2, s1
+; RV64-NEXT:    li a3, 0
+; RV64-NEXT:    call __multi3
+; RV64-NEXT:    add a2, a0, s6
+; RV64-NEXT:    sltu a3, s6, s3
+; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    add s7, a2, s7
+; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    sltu a1, s7, a2
+; RV64-NEXT:    add a0, a0, s8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    sd s4, 0(s0)
+; RV64-NEXT:    sd s5, 8(s0)
+; RV64-NEXT:    sd s7, 16(s0)
+; RV64-NEXT:    sd a0, 24(s0)
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
+  %r = mul i256 %a, %b
+  ret i256 %r
+}
+
+; i256 bitwise and
+define i256 @and_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: and_i256:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 16(a2)
+; RV32-NEXT:    lw a4, 20(a2)
+; RV32-NEXT:    lw a5, 24(a2)
+; RV32-NEXT:    lw a6, 28(a2)
+; RV32-NEXT:    lw a7, 16(a1)
+; RV32-NEXT:    lw t0, 20(a1)
+; RV32-NEXT:    lw t1, 24(a1)
+; RV32-NEXT:    lw t2, 28(a1)
+; RV32-NEXT:    lw t3, 0(a2)
+; RV32-NEXT:    lw t4, 4(a2)
+; RV32-NEXT:    lw t5, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    lw t6, 0(a1)
+; RV32-NEXT:    lw s0, 4(a1)
+; RV32-NEXT:    lw s1, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    and t3, t6, t3
+; RV32-NEXT:    and t4, s0, t4
+; RV32-NEXT:    and t5, s1, t5
+; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a2, a7, a3
+; RV32-NEXT:    and a3, t0, a4
+; RV32-NEXT:    and a4, t1, a5
+; RV32-NEXT:    and a5, t2, a6
+; RV32-NEXT:    sw a2, 16(a0)
+; RV32-NEXT:    sw a3, 20(a0)
+; RV32-NEXT:    sw a4, 24(a0)
+; RV32-NEXT:    sw a5, 28(a0)
+; RV32-NEXT:    sw t3, 0(a0)
+; RV32-NEXT:    sw t4, 4(a0)
+; RV32-NEXT:    sw t5, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: and_i256:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a3, 0(a2)
+; RV64-NEXT:    ld a4, 8(a2)
+; RV64-NEXT:    ld a5, 16(a2)
+; RV64-NEXT:    ld a2, 24(a2)
+; RV64-NEXT:    ld a6, 0(a1)
+; RV64-NEXT:    ld a7, 8(a1)
+; RV64-NEXT:    ld t0, 16(a1)
+; RV64-NEXT:    ld a1, 24(a1)
+; RV64-NEXT:    and a3, a6, a3
+; RV64-NEXT:    and a4, a7, a4
+; RV64-NEXT:    and a5, t0, a5
+; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    sd a3, 0(a0)
+; RV64-NEXT:    sd a4, 8(a0)
+; RV64-NEXT:    sd a5, 16(a0)
+; RV64-NEXT:    sd a1, 24(a0)
+; RV64-NEXT:    ret
+  %r = and i256 %a, %b
+  ret i256 %r
+}
+
+; i256 bitwise xor (the key operation for Hamming-distance computation)
+define i256 @xor_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: xor_i256:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 16(a2)
+; RV32-NEXT:    lw a4, 20(a2)
+; RV32-NEXT:    lw a5, 24(a2)
+; RV32-NEXT:    lw a6, 28(a2)
+; RV32-NEXT:    lw a7, 16(a1)
+; RV32-NEXT:    lw t0, 20(a1)
+; RV32-NEXT:    lw t1, 24(a1)
+; RV32-NEXT:    lw t2, 28(a1)
+; RV32-NEXT:    lw t3, 0(a2)
+; RV32-NEXT:    lw t4, 4(a2)
+; RV32-NEXT:    lw t5, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    lw t6, 0(a1)
+; RV32-NEXT:    lw s0, 4(a1)
+; RV32-NEXT:    lw s1, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    xor t3, t6, t3
+; RV32-NEXT:    xor t4, s0, t4
+; RV32-NEXT:    xor t5, s1, t5
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    xor a2, a7, a3
+; RV32-NEXT:    xor a3, t0, a4
+; RV32-NEXT:    xor a4, t1, a5
+; RV32-NEXT:    xor a5, t2, a6
+; RV32-NEXT:    sw a2, 16(a0)
+; RV32-NEXT:    sw a3, 20(a0)
+; RV32-NEXT:    sw a4, 24(a0)
+; RV32-NEXT:    sw a5, 28(a0)
+; RV32-NEXT:    sw t3, 0(a0)
+; RV32-NEXT:    sw t4, 4(a0)
+; RV32-NEXT:    sw t5, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: xor_i256:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a3, 0(a2)
+; RV64-NEXT:    ld a4, 8(a2)
+; RV64-NEXT:    ld a5, 16(a2)
+; RV64-NEXT:    ld a2, 24(a2)
+; RV64-NEXT:    ld a6, 0(a1)
+; RV64-NEXT:    ld a7, 8(a1)
+; RV64-NEXT:    ld t0, 16(a1)
+; RV64-NEXT:    ld a1, 24(a1)
+; RV64-NEXT:    xor a3, a6, a3
+; RV64-NEXT:    xor a4, a7, a4
+; RV64-NEXT:    xor a5, t0, a5
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    sd a3, 0(a0)
+; RV64-NEXT:    sd a4, 8(a0)
+; RV64-NEXT:    sd a5, 16(a0)
+; RV64-NEXT:    sd a1, 24(a0)
+; RV64-NEXT:    ret
+  %r = xor i256 %a, %b
+  ret i256 %r
+}
+
+; i256 popcount (Hamming weight)
+declare i256 @llvm.ctpop.i256(i256)
+define i256 @ctpop_i256(i256 %a) nounwind {
+; RV32-LABEL: ctpop_i256:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a3, 16(a1)
+; RV32-NEXT:    lw a5, 20(a1)
+; RV32-NEXT:    lw a7, 24(a1)
+; RV32-NEXT:    lw t0, 28(a1)
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a6, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    lui t1, 349525
+; RV32-NEXT:    addi t1, t1, 1365
+; RV32-NEXT:    srli t2, t0, 1
+; RV32-NEXT:    srli t3, a7, 1
+; RV32-NEXT:    srli t4, a5, 1
+; RV32-NEXT:    srli t5, a3, 1
+; RV32-NEXT:    srli t6, a1, 1
+; RV32-NEXT:    srli s0, a6, 1
+; RV32-NEXT:    srli s1, a4, 1
+; RV32-NEXT:    srli s2, a2, 1
+; RV32-NEXT:    and t2, t2, t1
+; RV32-NEXT:    and t3, t3, t1
+; RV32-NEXT:    and t4, t4, t1
+; RV32-NEXT:    and t5, t5, t1
+; RV32-NEXT:    and t6, t6, t1
+; RV32-NEXT:    and s0, s0, t1
+; RV32-NEXT:    and s1, s1, t1
+; RV32-NEXT:    and t1, s2, t1
+; RV32-NEXT:    lui s2, 209715
+; RV32-NEXT:    addi s2, s2, 819
+; RV32-NEXT:    sub t0, t0, t2
+; RV32-NEXT:    sub a7, a7, t3
+; RV32-NEXT:    sub a5, a5, t4
+; RV32-NEXT:    sub a3, a3, t5
+; RV32-NEXT:    sub a1, a1, t6
+; RV32-NEXT:    sub a6, a6, s0
+; RV32-NEXT:    sub a4, a4, s1
+; RV32-NEXT:    sub a2, a2, t1
+; RV32-NEXT:    and t1, t0, s2
+; RV32-NEXT:    srli t0, t0, 2
+; RV32-NEXT:    and t2, a7, s2
+; RV32-NEXT:    srli a7, a7, 2
+; RV32-NEXT:    and t3, a5, s2
+; RV32-NEXT:    srli a5, a5, 2
+; RV32-NEXT:    and t4, a3, s2
+; RV32-NEXT:    srli a3, a3, 2
+; RV32-NEXT:    and t5, a1, s2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    and t6, a6, s2
+; RV32-NEXT:    srli a6, a6, 2
+; RV32-NEXT:    and s0, a4, s2
+; RV32-NEXT:    srli a4, a4, 2
+; RV32-NEXT:    and s1, a2, s2
+; RV32-NEXT:    srli a2, a2, 2
+; RV32-NEXT:    and t0, t0, s2
+; RV32-NEXT:    and a7, a7, s2
+; RV32-NEXT:    and a5, a5, s2
+; RV32-NEXT:    and a3, a3, s2
+; RV32-NEXT:    and a1, a1, s2
+; RV32-NEXT:    and a6, a6, s2
+; RV32-NEXT:    and a4, a4, s2
+; RV32-NEXT:    and a2, a2, s2
+; RV32-NEXT:    add t0, t1, t0
+; RV32-NEXT:    add a7, t2, a7
+; RV32-NEXT:    add a5, t3, a5
+; RV32-NEXT:    add a3, t4, a3
+; RV32-NEXT:    add a1, t5, a1
+; RV32-NEXT:    lui t1, 61681
+; RV32-NEXT:    addi t1, t1, -241
+; RV32-NEXT:    add a6, t6, a6
+; RV32-NEXT:    add a4, s0, a4
+; RV32-NEXT:    add a2, s1, a2
+; RV32-NEXT:    srli t2, t0, 4
+; RV32-NEXT:    srli t3, a7, 4
+; RV32-NEXT:    srli t4, a5, 4
+; RV32-NEXT:    add t0, t0, t2
+; RV32-NEXT:    srli t2, a3, 4
+; RV32-NEXT:    add a7, a7, t3
+; RV32-NEXT:    srli t3, a1, 4
+; RV32-NEXT:    add a5, a5, t4
+; RV32-NEXT:    srli t4, a6, 4
+; RV32-NEXT:    add a3, a3, t2
+; RV32-NEXT:    srli t2, a4, 4
+; RV32-NEXT:    add a1, a1, t3
+; RV32-NEXT:    srli t3, a2, 4
+; RV32-NEXT:    add a6, a6, t4
+; RV32-NEXT:    add a4, a4, t2
+; RV32-NEXT:    add a2, a2, t3
+; RV32-NEXT:    and t0, t0, t1
+; RV32-NEXT:    and a7, a7, t1
+; RV32-NEXT:    and a5, a5, t1
+; RV32-NEXT:    and a3, a3, t1
+; RV32-NEXT:    and a1, a1, t1
+; RV32-NEXT:    and a6, a6, t1
+; RV32-NEXT:    and a4, a4, t1
+; RV32-NEXT:    and a2, a2, t1
+; RV32-NEXT:    slli t1, t0, 8
+; RV32-NEXT:    slli t2, a7, 8
+; RV32-NEXT:    slli t3, a5, 8
+; RV32-NEXT:    slli t4, a3, 8
+; RV32-NEXT:    add t0, t0, t1
+; RV32-NEXT:    slli t1, a1, 8
+; RV32-NEXT:    add a7, a7, t2
+; RV32-NEXT:    slli t2, a6, 8
+; RV32-NEXT:    add a5, a5, t3
+; RV32-NEXT:    slli t3, a4, 8
+; RV32-NEXT:    add a3, a3, t4
+; RV32-NEXT:    slli t4, a2, 8
+; RV32-NEXT:    add a1, a1, t1
+; RV32-NEXT:    add a6, a6, t2
+; RV32-NEXT:    add a4, a4, t3
+; RV32-NEXT:    add a2, a2, t4
+; RV32-NEXT:    slli t1, t0, 16
+; RV32-NEXT:    slli t2, a7, 16
+; RV32-NEXT:    slli t3, a5, 16
+; RV32-NEXT:    slli t4, a3, 16
+; RV32-NEXT:    add t0, t0, t1
+; RV32-NEXT:    slli t1, a1, 16
+; RV32-NEXT:    add a7, a7, t2
+; RV32-NEXT:    slli t2, a6, 16
+; RV32-NEXT:    add a5, a5, t3
+; RV32-NEXT:    slli t3, a4, 16
+; RV32-NEXT:    add a3, a3, t4
+; RV32-NEXT:    slli t4, a2, 16
+; RV32-NEXT:    add a1, a1, t1
+; RV32-NEXT:    add a6, a6, t2
+; RV32-NEXT:    add a4, a4, t3
+; RV32-NEXT:    add a2, a2, t4
+; RV32-NEXT:    srli t0, t0, 24
+; RV32-NEXT:    srli a7, a7, 24
+; RV32-NEXT:    srli a5, a5, 24
+; RV32-NEXT:    srli a3, a3, 24
+; RV32-NEXT:    srli a1, a1, 24
+; RV32-NEXT:    srli a6, a6, 24
+; RV32-NEXT:    srli a4, a4, 24
+; RV32-NEXT:    srli a2, a2, 24
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a3, a3, a5
+; RV32-NEXT:    add a1, a6, a1
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a7, a3, a7
+; RV32-NEXT:    add a5, a2, a1
+; RV32-NEXT:    add a1, a5, a7
+; RV32-NEXT:    sltu a3, a7, a3
+; RV32-NEXT:    sltu a4, a5, a2
+; RV32-NEXT:    sltu a2, a1, a5
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    add a3, a3, a2
+; RV32-NEXT:    beq a3, a4, .LBB7_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a2, a3, a4
+; RV32-NEXT:  .LBB7_2:
+; RV32-NEXT:    sw zero, 16(a0)
+; RV32-NEXT:    sw zero, 20(a0)
+; RV32-NEXT:    sw zero, 24(a0)
+; RV32-NEXT:    sw zero, 28(a0)
+; RV32-NEXT:    sw a1, 0(a0)
+; RV32-NEXT:    sw a3, 4(a0)
+; RV32-NEXT:    sw a2, 8(a0)
+; RV32-NEXT:    sw zero, 12(a0)
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: ctpop_i256:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 0(a1)
+; RV64-NEXT:    ld a3, 8(a1)
+; RV64-NEXT:    ld a4, 16(a1)
+; RV64-NEXT:    ld a5, 24(a1)
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 61681
+; RV64-NEXT:    addi a1, a1, 1365
+; RV64-NEXT:    addi a6, a6, 819
+; RV64-NEXT:    addi a7, a7, -241
+; RV64-NEXT:    slli t0, a1, 32
+; RV64-NEXT:    slli t1, a6, 32
+; RV64-NEXT:    slli t2, a7, 32
+; RV64-NEXT:    add t0, a1, t0
+; RV64-NEXT:    add a6, a6, t1
+; RV64-NEXT:    add a1, a7, t2
+; RV64-NEXT:    srli a7, a5, 1
+; RV64-NEXT:    srli t1, a4, 1
+; RV64-NEXT:    srli t2, a3, 1
+; RV64-NEXT:    srli t3, a2, 1
+; RV64-NEXT:    and a7, a7, t0
+; RV64-NEXT:    and t1, t1, t0
+; RV64-NEXT:    and t2, t2, t0
+; RV64-NEXT:    and t0, t3, t0
+; RV64-NEXT:    sub a5, a5, a7
+; RV64-NEXT:    sub a4, a4, t1
+; RV64-NEXT:    sub a3, a3, t2
+; RV64-NEXT:    sub a2, a2, t0
+; RV64-NEXT:    and a7, a5, a6
+; RV64-NEXT:    srli a5, a5, 2
+; RV64-NEXT:    and t0, a4, a6
+; RV64-NEXT:    srli a4, a4, 2
+; RV64-NEXT:    and t1, a3, a6
+; RV64-NEXT:    srli a3, a3, 2
+; RV64-NEXT:    and t2, a2, a6
+; RV64-NEXT:    srli a2, a2, 2
+; RV64-NEXT:    and a5, a5, a6
+; RV64-NEXT:    and a4, a4, a6
+; RV64-NEXT:    and a3, a3, a6
+; RV64-NEXT:    and a2, a2, a6
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    add a4, t0, a4
+; RV64-NEXT:    add a3, t1, a3
+; RV64-NEXT:    add a2, t2, a2
+; RV64-NEXT:    srli a6, a5, 4
+; RV64-NEXT:    srli a7, a4, 4
+; RV64-NEXT:    srli t0, a3, 4
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    srli a6, a2, 4
+; RV64-NEXT:    add a4, a4, a7
+; RV64-NEXT:    add a3, a3, t0
+; RV64-NEXT:    add a2, a2, a6
+; RV64-NEXT:    and a5, a5, a1
+; RV64-NEXT:    and a4, a4, a1
+; RV64-NEXT:    and a3, a3, a1
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    slli a2, a5, 8
+; RV64-NEXT:    slli a6, a4, 8
+; RV64-NEXT:    slli a7, a3, 8
+; RV64-NEXT:    slli t0, a1, 8
+; RV64-NEXT:    add a2, a5, a2
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    add a1, a1, t0
+; RV64-NEXT:    slli a5, a2, 16
+; RV64-NEXT:    slli a6, a4, 16
+; RV64-NEXT:    slli a7, a3, 16
+; RV64-NEXT:    slli t0, a1, 16
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    add a1, a1, t0
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    slli t0, a1, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    add a1, a1, t0
+; RV64-NEXT:    srli a2, a2, 56
+; RV64-NEXT:    srli a4, a4, 56
+; RV64-NEXT:    srli a3, a3, 56
+; RV64-NEXT:    srli a1, a1, 56
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    add a2, a1, a2
+; RV64-NEXT:    sltu a1, a2, a1
+; RV64-NEXT:    sd a2, 0(a0)
+; RV64-NEXT:    sd a1, 8(a0)
+; RV64-NEXT:    sd zero, 16(a0)
+; RV64-NEXT:    sd zero, 24(a0)
+; RV64-NEXT:    ret
+  %r = call i256 @llvm.ctpop.i256(i256 %a)
+  ret i256 %r
+}
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index 0bb3b179cc305..26939c81ef535 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -6,1025 +6,638 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-LABEL: test_1024:
 ; EGPR:       # %bb.0:
 ; EGPR-NEXT:    pushq %rbp
+; EGPR-NEXT:    movq %rsp, %rbp
 ; EGPR-NEXT:    pushq %r15
 ; EGPR-NEXT:    pushq %r14
 ; EGPR-NEXT:    pushq %r13
 ; EGPR-NEXT:    pushq %r12
 ; EGPR-NEXT:    pushq %rbx
-; EGPR-NEXT:    subq $104, %rsp
+; EGPR-NEXT:    andq $-32, %rsp
+; EGPR-NEXT:    subq $1216, %rsp # imm = 0x4C0
 ; EGPR-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %rdi, %r26
-; EGPR-NEXT:    movq (%rdi), %r13
-; EGPR-NEXT:    movq 8(%rdi), %r18
-; EGPR-NEXT:    movq 24(%rdi), %r21
-; EGPR-NEXT:    movq 16(%rdi), %r17
-; EGPR-NEXT:    movq 40(%rdi), %rdi
-; EGPR-NEXT:    movq 32(%r26), %r10
-; EGPR-NEXT:    movq 56(%r26), %r15
-; EGPR-NEXT:    movq 48(%r26), %r12
-; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq 24(%rsi), %r25
-; EGPR-NEXT:    movq 16(%rsi), %r11
-; EGPR-NEXT:    movq (%rsi), %r31
-; EGPR-NEXT:    movq 8(%rsi), %r14
-; EGPR-NEXT:    movq %r12, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r19
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r12, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %rdx, %r22
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %r16, %r8
-; EGPR-NEXT:    adcq %r9, %r22
-; EGPR-NEXT:    setb %al
-; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r22, %r16
-; EGPR-NEXT:    adcq %rcx, %r9
-; EGPR-NEXT:    movq %r10, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r22
-; EGPR-NEXT:    movq %rax, %r27
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r23
-; EGPR-NEXT:    movq %rax, %r24
-; EGPR-NEXT:    addq %r22, %r24
-; EGPR-NEXT:    adcq $0, %r23
-; EGPR-NEXT:    movq %r10, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %rdx, %r22
-; EGPR-NEXT:    movq %rax, %r20
-; EGPR-NEXT:    addq %r24, %r20
-; EGPR-NEXT:    adcq %r23, %r22
-; EGPR-NEXT:    setb %al
-; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %rdx, %r23
-; EGPR-NEXT:    movq %rax, %r24
-; EGPR-NEXT:    addq %r22, %r24
-; EGPR-NEXT:    adcq %rcx, %r23
-; EGPR-NEXT:    addq %r19, %r24
-; EGPR-NEXT:    adcq %r8, %r23
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r10, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r28
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq (%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 8(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 16(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 24(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 96(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 104(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 112(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 120(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 64(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 72(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 80(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 88(%rdi), %r13
+; EGPR-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 32(%rdi), %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 40(%rdi), %rdx
+; EGPR-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 48(%rdi), %rcx
+; EGPR-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 56(%rdi), %r8
+; EGPR-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 96(%rsi), %rdi
 ; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r19
-; EGPR-NEXT:    movq %rax, %r22
-; EGPR-NEXT:    addq %r8, %r22
-; EGPR-NEXT:    adcq $0, %r19
-; EGPR-NEXT:    movq %r10, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %rbx
-; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    addq %r22, %r29
-; EGPR-NEXT:    adcq %r19, %rbx
-; EGPR-NEXT:    setb %al
-; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %rbx, %r8
-; EGPR-NEXT:    adcq %rcx, %r30
-; EGPR-NEXT:    addq %r24, %r28
-; EGPR-NEXT:    adcq %r23, %r29
-; EGPR-NEXT:    adcq $0, %r8
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    addq %r16, %r8
-; EGPR-NEXT:    adcq %r9, %r30
-; EGPR-NEXT:    setb %al
-; EGPR-NEXT:    movzbl %al, %ecx
+; EGPR-NEXT:    movq 104(%rsi), %rdi
+; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 112(%rsi), %rdi
+; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 120(%rsi), %rdi
+; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq (%rsi), %rdi
+; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 8(%rsi), %rdi
+; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 16(%rsi), %rdi
+; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 24(%rsi), %rdi
+; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 32(%rsi), %r12
 ; EGPR-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r12, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %rsi
+; EGPR-NEXT:    movq 40(%rsi), %r15
 ; EGPR-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r16
-; EGPR-NEXT:    movq %rax, %r23
-; EGPR-NEXT:    addq %r9, %r23
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq %r12, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %rdi
-; EGPR-NEXT:    addq %r23, %rdi
-; EGPR-NEXT:    adcq %r16, %r9
-; EGPR-NEXT:    setb %al
-; EGPR-NEXT:    movzbl %al, %r10d
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %r23
-; EGPR-NEXT:    movq %rax, %r24
-; EGPR-NEXT:    addq %r9, %r24
-; EGPR-NEXT:    adcq %r10, %r23
-; EGPR-NEXT:    addq %r8, %rsi
-; EGPR-NEXT:    movq %rsi, %r19
-; EGPR-NEXT:    adcq %r30, %rdi
-; EGPR-NEXT:    adcq %rcx, %r24
-; EGPR-NEXT:    adcq $0, %r23
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %rbx
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %r16, %r30
-; EGPR-NEXT:    adcq %r9, %r8
-; EGPR-NEXT:    setb %al
-; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %r14, %rsi
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    adcq %rcx, %r9
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r14
-; EGPR-NEXT:    movq %rax, %r15
-; EGPR-NEXT:    addq %r8, %r15
-; EGPR-NEXT:    adcq $0, %r14
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    mulq %rsi
-; EGPR-NEXT:    movq %rdx, %r12
-; EGPR-NEXT:    addq %r15, %rax
-; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r14, %r12
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %rsi
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r15
-; EGPR-NEXT:    addq %r12, %r15
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %r8
-; EGPR-NEXT:    addq %rbx, %r15
-; EGPR-NEXT:    adcq %r30, %r8
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %rsi
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %rbx
-; EGPR-NEXT:    movq %rax, %r14
-; EGPR-NEXT:    addq %r30, %r14
-; EGPR-NEXT:    adcq $0, %rbx
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %r12
-; EGPR-NEXT:    addq %r14, %rax
-; EGPR-NEXT:    movq %rax, %r10
-; EGPR-NEXT:    adcq %rbx, %r12
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %r14
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %r12, %r30
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %r14
-; EGPR-NEXT:    addq %r15, %rsi
-; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r8, %r10
+; EGPR-NEXT:    movq 48(%rsi), %rbx
+; EGPR-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 56(%rsi), %r14
+; EGPR-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 64(%rsi), %r9
+; EGPR-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 72(%rsi), %r16
+; EGPR-NEXT:    movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 80(%rsi), %r10
 ; EGPR-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    adcq $0, %r14
-; EGPR-NEXT:    addq %r16, %r30
-; EGPR-NEXT:    adcq %r9, %r14
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r17, %rax
+; EGPR-NEXT:    movq 88(%rsi), %r11
 ; EGPR-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %rbx
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r15
-; EGPR-NEXT:    addq %r16, %r15
-; EGPR-NEXT:    adcq %r9, %r8
-; EGPR-NEXT:    setb %r9b
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r25
-; EGPR-NEXT:    movq %rdx, %r12
-; EGPR-NEXT:    movq %rax, %rbp
-; EGPR-NEXT:    addq %r8, %rbp
-; EGPR-NEXT:    movzbl %r9b, %eax
-; EGPR-NEXT:    adcq %rax, %r12
-; EGPR-NEXT:    addq %r30, %rbx
-; EGPR-NEXT:    adcq %r14, %r15
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %rbp
-; EGPR-NEXT:    adcq $0, %r12
-; EGPR-NEXT:    addq %r27, %rbx
+; EGPR-NEXT:    subq $8, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %rax, %rsi
+; EGPR-NEXT:    pushq %r11
+; EGPR-NEXT:    pushq %r10
+; EGPR-NEXT:    pushq %r16
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq 32(%rsi), %r27
-; EGPR-NEXT:    adcq %r20, %r15
-; EGPR-NEXT:    adcq %r28, %rbp
-; EGPR-NEXT:    adcq %r29, %r12
-; EGPR-NEXT:    adcq $0, %r19
-; EGPR-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq $0, %rdi
-; EGPR-NEXT:    adcq $0, %r24
-; EGPR-NEXT:    adcq $0, %r23
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r20
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq 40(%rsi), %rcx
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %r16, %r30
-; EGPR-NEXT:    adcq %r9, %r8
-; EGPR-NEXT:    setb %r10b
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    movzbl %r10b, %eax
-; EGPR-NEXT:    adcq %rax, %r9
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r19
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r28
-; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    addq %r8, %r29
-; EGPR-NEXT:    adcq $0, %r28
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r22
-; EGPR-NEXT:    addq %r29, %r22
-; EGPR-NEXT:    adcq %r28, %r8
-; EGPR-NEXT:    setb %r10b
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r28
-; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    addq %r8, %r29
-; EGPR-NEXT:    movzbl %r10b, %eax
-; EGPR-NEXT:    adcq %rax, %r28
-; EGPR-NEXT:    addq %r20, %r29
-; EGPR-NEXT:    adcq %r30, %r28
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq 48(%rsi), %r20
-; EGPR-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r11
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %r14
-; EGPR-NEXT:    addq %r8, %r14
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    movq 56(%rsi), %r10
-; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %r13
-; EGPR-NEXT:    addq %r14, %rax
-; EGPR-NEXT:    movq %rax, %r14
-; EGPR-NEXT:    adcq %r30, %r13
-; EGPR-NEXT:    setb %sil
-; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %r13, %r8
-; EGPR-NEXT:    movzbl %sil, %eax
-; EGPR-NEXT:    adcq %rax, %r30
-; EGPR-NEXT:    addq %r29, %r11
-; EGPR-NEXT:    adcq %r28, %r14
-; EGPR-NEXT:    adcq $0, %r8
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    addq %r16, %r8
-; EGPR-NEXT:    adcq %r9, %r30
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r28
-; EGPR-NEXT:    movq %r21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r16
-; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    addq %r9, %r29
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r17
-; EGPR-NEXT:    addq %r29, %r17
-; EGPR-NEXT:    adcq %r16, %r9
-; EGPR-NEXT:    setb %r16b
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %r13
-; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    addq %r9, %r29
-; EGPR-NEXT:    movzbl %r16b, %eax
-; EGPR-NEXT:    adcq %rax, %r13
-; EGPR-NEXT:    addq %r8, %r28
-; EGPR-NEXT:    adcq %r30, %r17
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r29
-; EGPR-NEXT:    adcq $0, %r13
-; EGPR-NEXT:    addq %rbx, %r19
-; EGPR-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r15, %r22
-; EGPR-NEXT:    movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %rbp, %r11
-; EGPR-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r12, %r14
-; EGPR-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq $0, %r28
-; EGPR-NEXT:    adcq $0, %r17
-; EGPR-NEXT:    adcq $0, %r29
-; EGPR-NEXT:    adcq $0, %r13
-; EGPR-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq %rdi, %r17
-; EGPR-NEXT:    adcq %r24, %r29
-; EGPR-NEXT:    adcq %r23, %r13
-; EGPR-NEXT:    setb %r15b
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NEXT:    movq %r13, %r8
+; EGPR-NEXT:    movq %r12, %r9
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    pushq %rbx
+; EGPR-NEXT:    pushq %r15
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r19
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload
-; EGPR-NEXT:    movq %r23, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %rsi, %rax
-; EGPR-NEXT:    movq %rsi, %r21
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r22
-; EGPR-NEXT:    addq %r16, %r22
-; EGPR-NEXT:    adcq %r9, %r8
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r23, %rax
-; EGPR-NEXT:    movq %r23, %r14
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r9
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %rdi
+; EGPR-NEXT:    movq %rbx, %r9
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r23
-; EGPR-NEXT:    movq %rax, %r24
-; EGPR-NEXT:    addq %r8, %r24
-; EGPR-NEXT:    adcq $0, %r23
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    addq %r24, %rax
-; EGPR-NEXT:    movq %rax, %r11
-; EGPR-NEXT:    adcq %r23, %r8
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %rsi, %rax
-; EGPR-NEXT:    movq %rsi, %r23
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r24
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %r8, %r30
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r24
-; EGPR-NEXT:    addq %r19, %r30
-; EGPR-NEXT:    adcq %r22, %r24
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NEXT:    movq %r12, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %rbx, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT:    movq %r13, %rsi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %rbx, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r13, %rsi
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT:    movq %r13, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT:    movq %r12, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %r13, %r9
+; EGPR-NEXT:    movq %r13, %r12
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT:    movq %r13, %rsi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %rbx, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r13, %rsi
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    movq %r15, %rbx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %r12, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r13, %rsi
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r13, %rsi
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT:    movq %r14, %rsi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT:    movq %r13, %rcx
+; EGPR-NEXT:    movq %r15, %r8
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %r13, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r15
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NEXT:    movq %r12, %rsi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %r13, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %rbx
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r12, %rsi
+; EGPR-NEXT:    movq %r15, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %r14, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NEXT:    pushq %r12
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %r14, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r12
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT:    movq %r14, %rsi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT:    movq %rbx, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r14, %rsi
+; EGPR-NEXT:    movq %rbx, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT:    movq %r14, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT:    pushq %rbx
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT:    movq %r15, %rsi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT:    movq %r13, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %r14, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %rbx
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r15, %rsi
+; EGPR-NEXT:    movq %r13, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT:    movq %rbx, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT:    pushq %r13
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq %rbx, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r13
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT:    movq %r15, %rsi
+; EGPR-NEXT:    movq %r12, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT:    movq %rbx, %r9
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT:    pushq %r14
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r15, %rsi
+; EGPR-NEXT:    movq %r12, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r15, %rsi
+; EGPR-NEXT:    movq %r12, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq %r13
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %r15, %rsi
+; EGPR-NEXT:    movq %r12, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %rbx, %rsi
+; EGPR-NEXT:    movq %r14, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $24, %rsp
+; EGPR-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq %rbx, %rsi
+; EGPR-NEXT:    movq %r14, %rdx
+; EGPR-NEXT:    xorl %ecx, %ecx
+; EGPR-NEXT:    xorl %r8d, %r8d
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq $0
+; EGPR-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT:    callq __multi5 at PLT
+; EGPR-NEXT:    addq $32, %rsp
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    adcq $0, %rcx
+; EGPR-NEXT:    adcq $0, %rax
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
 ; EGPR-NEXT:    adcq $0, %r16
+; EGPR-NEXT:    adcq $0, %r11
+; EGPR-NEXT:    addq %rcx, %r16
+; EGPR-NEXT:    adcq %rax, %r11
+; EGPR-NEXT:    setb %al
+; EGPR-NEXT:    movzbl %al, %r17d
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r18
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r17
+; EGPR-NEXT:    adcq $0, %r18
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT:    adcq $0, %r10
 ; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %rsi
-; EGPR-NEXT:    movq %r23, %rax
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r19
-; EGPR-NEXT:    movq %rax, %r22
-; EGPR-NEXT:    addq %r8, %r22
-; EGPR-NEXT:    adcq $0, %r19
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %rbx
-; EGPR-NEXT:    addq %r22, %rax
-; EGPR-NEXT:    movq %rax, %r22
-; EGPR-NEXT:    adcq %r19, %rbx
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r23, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %r23
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %rbx, %r8
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r23
-; EGPR-NEXT:    addq %r30, %rsi
-; EGPR-NEXT:    adcq %r24, %r22
-; EGPR-NEXT:    adcq $0, %r8
-; EGPR-NEXT:    adcq $0, %r23
-; EGPR-NEXT:    addq %r16, %r8
-; EGPR-NEXT:    adcq %r9, %r23
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r24
-; EGPR-NEXT:    movq %r14, %rax
-; EGPR-NEXT:    mulq %r20
-; EGPR-NEXT:    movq %rdx, %r16
-; EGPR-NEXT:    movq %rax, %r19
-; EGPR-NEXT:    addq %r9, %r19
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT:    adcq $0, %rdx
+; EGPR-NEXT:    adcq $0, %rsi
+; EGPR-NEXT:    addq %r10, %rdx
+; EGPR-NEXT:    adcq %r9, %rsi
+; EGPR-NEXT:    setb %r9b
+; EGPR-NEXT:    movzbl %r9b, %r19d
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r22
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NEXT:    adcq $0, %r22
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT:    adcq %r8, %r19
+; EGPR-NEXT:    adcq %rdi, %r22
 ; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    addq %r19, %rax
-; EGPR-NEXT:    movq %rax, %r19
-; EGPR-NEXT:    adcq %r16, %r9
-; EGPR-NEXT:    setb %r16b
-; EGPR-NEXT:    movq %r14, %rax
-; EGPR-NEXT:    mulq %r10
-; EGPR-NEXT:    movq %rdx, %rbp
-; EGPR-NEXT:    movq %rax, %r12
-; EGPR-NEXT:    addq %r9, %r12
-; EGPR-NEXT:    movzbl %r16b, %eax
-; EGPR-NEXT:    adcq %rax, %rbp
-; EGPR-NEXT:    addq %r8, %r24
-; EGPR-NEXT:    adcq %r23, %r19
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r12
-; EGPR-NEXT:    adcq $0, %rbp
-; EGPR-NEXT:    addq %r28, %rdi
-; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r17, %r11
-; EGPR-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r29, %rsi
-; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r13, %r22
-; EGPR-NEXT:    movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movzbl %r15b, %eax
-; EGPR-NEXT:    adcq %rax, %r24
-; EGPR-NEXT:    movq %r24, (%rsp) # 8-byte Spill
-; EGPR-NEXT:    adcq $0, %r19
-; EGPR-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq $0, %r12
-; EGPR-NEXT:    adcq $0, %rbp
-; EGPR-NEXT:    movq 64(%r26), %r23
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r23
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r24
-; EGPR-NEXT:    movq %r25, %rax
-; EGPR-NEXT:    mulq %r23
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
+; EGPR-NEXT:    adcq $0, %r11
+; EGPR-NEXT:    adcq $0, %r17
+; EGPR-NEXT:    adcq $0, %r18
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT:    adcq $0, %r24
+; EGPR-NEXT:    adcq $0, %r23
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT:    adcq $0, %r10
 ; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq 72(%r26), %r28
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r28
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %r16, %r30
-; EGPR-NEXT:    adcq %r9, %r8
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r25, %rax
-; EGPR-NEXT:    mulq %r28
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r9
-; EGPR-NEXT:    movq %r31, %rax
-; EGPR-NEXT:    mulq %r23
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NEXT:    movq %r11, %rax
-; EGPR-NEXT:    mulq %r23
-; EGPR-NEXT:    movq %rdx, %r29
-; EGPR-NEXT:    movq %rax, %rbx
-; EGPR-NEXT:    addq %r8, %rbx
-; EGPR-NEXT:    adcq $0, %r29
-; EGPR-NEXT:    movq %r31, %rax
-; EGPR-NEXT:    mulq %r28
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    addq %rbx, %rax
-; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r29, %r8
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r11, %rax
-; EGPR-NEXT:    mulq %r28
-; EGPR-NEXT:    movq %rdx, %r29
-; EGPR-NEXT:    movq %rax, %rbx
-; EGPR-NEXT:    addq %r8, %rbx
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r29
-; EGPR-NEXT:    addq %r24, %rbx
-; EGPR-NEXT:    adcq %r30, %r29
-; EGPR-NEXT:    adcq $0, %r16
+; EGPR-NEXT:    addq %r24, %r10
+; EGPR-NEXT:    adcq %r23, %r9
+; EGPR-NEXT:    setb %r23b
+; EGPR-NEXT:    movzbl %r23b, %r23d
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT:    adcq $0, %r24
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT:    adcq %r19, %rdi
+; EGPR-NEXT:    adcq %r22, %r8
+; EGPR-NEXT:    adcq $0, %r10
 ; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq 80(%r26), %r13
-; EGPR-NEXT:    movq %r31, %rax
-; EGPR-NEXT:    mulq %r13
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %rsi
-; EGPR-NEXT:    movq %r11, %rax
-; EGPR-NEXT:    mulq %r13
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %r14
-; EGPR-NEXT:    addq %r8, %r14
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    movq 88(%r26), %r18
-; EGPR-NEXT:    movq %r31, %rax
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %r15
-; EGPR-NEXT:    movq %rax, %r24
-; EGPR-NEXT:    addq %r14, %r24
-; EGPR-NEXT:    adcq %r30, %r15
-; EGPR-NEXT:    setb %r14b
-; EGPR-NEXT:    movq %r11, %rax
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %r15, %r8
-; EGPR-NEXT:    movzbl %r14b, %eax
-; EGPR-NEXT:    adcq %rax, %r30
-; EGPR-NEXT:    addq %rbx, %rsi
-; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r29, %r24
-; EGPR-NEXT:    adcq $0, %r8
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    addq %r16, %r8
-; EGPR-NEXT:    adcq %r9, %r30
-; EGPR-NEXT:    setb %r29b
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r13
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %rsi
-; EGPR-NEXT:    movq %r25, %rax
-; EGPR-NEXT:    mulq %r13
-; EGPR-NEXT:    movq %rdx, %r16
-; EGPR-NEXT:    movq %rax, %r14
-; EGPR-NEXT:    addq %r9, %r14
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %rbx
-; EGPR-NEXT:    addq %r14, %rbx
-; EGPR-NEXT:    adcq %r16, %r9
-; EGPR-NEXT:    setb %r16b
-; EGPR-NEXT:    movq %r25, %rax
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %r14
-; EGPR-NEXT:    movq %rax, %r15
-; EGPR-NEXT:    addq %r9, %r15
-; EGPR-NEXT:    movzbl %r16b, %eax
-; EGPR-NEXT:    adcq %rax, %r14
-; EGPR-NEXT:    addq %r8, %rsi
-; EGPR-NEXT:    adcq %r30, %rbx
-; EGPR-NEXT:    movzbl %r29b, %eax
-; EGPR-NEXT:    adcq %rax, %r15
+; EGPR-NEXT:    adcq $0, %r23
+; EGPR-NEXT:    adcq $0, %r24
+; EGPR-NEXT:    addq %r16, %r10
+; EGPR-NEXT:    adcq %r11, %r9
+; EGPR-NEXT:    adcq %r17, %r23
+; EGPR-NEXT:    adcq %r18, %r24
+; EGPR-NEXT:    setb %r11b
+; EGPR-NEXT:    movzbl %r11b, %r25d
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r19
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r22
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT:    adcq $0, %r22
+; EGPR-NEXT:    adcq $0, %r19
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r18
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r17
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT:    adcq $0, %r18
+; EGPR-NEXT:    adcq $0, %r17
+; EGPR-NEXT:    addq %r22, %r18
+; EGPR-NEXT:    adcq %r19, %r17
+; EGPR-NEXT:    setb %r19b
+; EGPR-NEXT:    movzbl %r19b, %r19d
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r22
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r18
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r17
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NEXT:    adcq $0, %r22
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT:    adcq %r23, %r16
+; EGPR-NEXT:    adcq %r24, %r11
+; EGPR-NEXT:    adcq %r25, %r18
+; EGPR-NEXT:    adcq $0, %r17
+; EGPR-NEXT:    adcq $0, %r19
+; EGPR-NEXT:    adcq $0, %r22
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT:    adcq $0, %r26
+; EGPR-NEXT:    adcq $0, %r25
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT:    adcq $0, %r21
+; EGPR-NEXT:    adcq $0, %r28
+; EGPR-NEXT:    addq %r26, %r21
+; EGPR-NEXT:    adcq %r25, %r28
+; EGPR-NEXT:    setb %r25b
+; EGPR-NEXT:    movzbl %r25b, %r25d
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT:    adcq $0, %r26
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r27
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r30
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r20
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r31
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r20
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r31
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r30
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r27
+; EGPR-NEXT:    addq %r21, %r20
+; EGPR-NEXT:    adcq %r28, %r31
+; EGPR-NEXT:    adcq %r25, %r30
+; EGPR-NEXT:    adcq %r26, %r27
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT:    adcq $0, %r28
+; EGPR-NEXT:    adcq $0, %r21
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT:    adcq $0, %rbx
 ; EGPR-NEXT:    adcq $0, %r14
-; EGPR-NEXT:    imulq %r27, %r18
-; EGPR-NEXT:    movq %r27, %rax
-; EGPR-NEXT:    mulq %r13
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %r18, %rdx
-; EGPR-NEXT:    imulq %rcx, %r13
-; EGPR-NEXT:    addq %rdx, %r13
-; EGPR-NEXT:    movq %r20, %r9
-; EGPR-NEXT:    imulq %r28, %r9
-; EGPR-NEXT:    movq %r20, %rax
-; EGPR-NEXT:    mulq %r23
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %r9, %rdx
-; EGPR-NEXT:    imulq %r23, %r10
-; EGPR-NEXT:    addq %rdx, %r10
-; EGPR-NEXT:    addq %r8, %r30
-; EGPR-NEXT:    adcq %r13, %r10
-; EGPR-NEXT:    movq %r23, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r9
-; EGPR-NEXT:    movq %r28, %rax
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rdx, %r27
-; EGPR-NEXT:    movq %rax, %r20
-; EGPR-NEXT:    addq %r8, %r20
-; EGPR-NEXT:    adcq $0, %r27
-; EGPR-NEXT:    movq %r23, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r20, %r16
-; EGPR-NEXT:    adcq %r27, %r8
-; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r28, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r23
-; EGPR-NEXT:    movq %rax, %r20
-; EGPR-NEXT:    addq %r8, %r20
-; EGPR-NEXT:    movzbl %r18b, %eax
-; EGPR-NEXT:    adcq %rax, %r23
-; EGPR-NEXT:    addq %r30, %r20
-; EGPR-NEXT:    adcq %r10, %r23
-; EGPR-NEXT:    movq 112(%r26), %rcx
-; EGPR-NEXT:    movq %r31, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    imulq %r11, %rcx
-; EGPR-NEXT:    addq %rdx, %rcx
-; EGPR-NEXT:    movq 120(%r26), %rax
-; EGPR-NEXT:    imulq %r31, %rax
-; EGPR-NEXT:    addq %rax, %rcx
-; EGPR-NEXT:    movq 96(%r26), %r27
-; EGPR-NEXT:    movq 104(%r26), %r30
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    imulq %r30, %rdi
-; EGPR-NEXT:    mulq %r27
-; EGPR-NEXT:    movq %rax, %r21
-; EGPR-NEXT:    addq %rdi, %rdx
-; EGPR-NEXT:    imulq %r27, %r25
-; EGPR-NEXT:    addq %rdx, %r25
-; EGPR-NEXT:    addq %r8, %r21
-; EGPR-NEXT:    adcq %rcx, %r25
-; EGPR-NEXT:    movq %r27, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r22
-; EGPR-NEXT:    movq %r30, %rax
-; EGPR-NEXT:    mulq %r31
-; EGPR-NEXT:    movq %rdx, %r31
-; EGPR-NEXT:    movq %rax, %r28
-; EGPR-NEXT:    addq %r8, %r28
-; EGPR-NEXT:    adcq $0, %r31
-; EGPR-NEXT:    movq %r27, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r27
-; EGPR-NEXT:    addq %r28, %r27
-; EGPR-NEXT:    adcq %r31, %r8
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r30, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r26
-; EGPR-NEXT:    movq %rax, %r31
-; EGPR-NEXT:    addq %r8, %r31
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %r26
-; EGPR-NEXT:    addq %r21, %r31
-; EGPR-NEXT:    adcq %r25, %r26
-; EGPR-NEXT:    addq %r9, %r22
-; EGPR-NEXT:    adcq %r16, %r27
-; EGPR-NEXT:    adcq %r20, %r31
+; EGPR-NEXT:    addq %r28, %rbx
+; EGPR-NEXT:    adcq %r21, %r14
+; EGPR-NEXT:    setb %r21b
+; EGPR-NEXT:    movzbl %r21b, %r15d
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r15
+; EGPR-NEXT:    adcq $0, %r12
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r29
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r29
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r13
+; EGPR-NEXT:    addq %rbx, %r21
+; EGPR-NEXT:    adcq %r14, %r28
+; EGPR-NEXT:    adcq %r15, %r29
+; EGPR-NEXT:    adcq %r12, %r13
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT:    addq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT:    adcq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT:    adcq %r24, %r25
 ; EGPR-NEXT:    adcq %r23, %r26
-; EGPR-NEXT:    addq %rsi, %r22
-; EGPR-NEXT:    adcq %rbx, %r27
-; EGPR-NEXT:    adcq %r15, %r31
-; EGPR-NEXT:    adcq %r14, %r26
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NEXT:    movq 80(%r11), %rbx
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload
-; EGPR-NEXT:    mulq %r19
-; EGPR-NEXT:    movq %rax, %r23
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq 88(%r11), %r20
-; EGPR-NEXT:    movq %r20, %rax
-; EGPR-NEXT:    mulq %r19
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 8-byte Reload
-; EGPR-NEXT:    mulq %r17
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %r16, %r30
-; EGPR-NEXT:    adcq %r9, %r8
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r20, %rax
-; EGPR-NEXT:    mulq %r17
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r16
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %r9
-; EGPR-NEXT:    movq 64(%r11), %r15
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r19
-; EGPR-NEXT:    movq %rax, %r25
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq 72(%r11), %r14
-; EGPR-NEXT:    movq %r14, %rax
-; EGPR-NEXT:    mulq %r19
-; EGPR-NEXT:    movq %rdx, %r28
-; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    addq %r8, %r29
-; EGPR-NEXT:    adcq $0, %r28
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r17
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r21
-; EGPR-NEXT:    addq %r29, %r21
-; EGPR-NEXT:    adcq %r28, %r8
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r14, %rax
-; EGPR-NEXT:    mulq %r17
-; EGPR-NEXT:    movq %rdx, %r29
-; EGPR-NEXT:    movq %rax, %r13
-; EGPR-NEXT:    addq %r8, %r13
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %r29
-; EGPR-NEXT:    addq %r23, %r13
+; EGPR-NEXT:    adcq %r20, %r21
+; EGPR-NEXT:    adcq %r31, %r28
 ; EGPR-NEXT:    adcq %r30, %r29
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NEXT:    mulq %rdi
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r28
-; EGPR-NEXT:    movq %r14, %rax
-; EGPR-NEXT:    mulq %rdi
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %rcx
-; EGPR-NEXT:    addq %r8, %rcx
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %r10
-; EGPR-NEXT:    movq %rax, %r23
-; EGPR-NEXT:    addq %rcx, %r23
-; EGPR-NEXT:    adcq %r30, %r10
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r14, %rax
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %r30
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %r10, %r8
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %r30
-; EGPR-NEXT:    addq %r13, %r28
-; EGPR-NEXT:    adcq %r29, %r23
-; EGPR-NEXT:    adcq $0, %r8
-; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    addq %r16, %r8
-; EGPR-NEXT:    adcq %r9, %r30
-; EGPR-NEXT:    setb %sil
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    mulq %rdi
-; EGPR-NEXT:    movq %rdx, %rcx
-; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    movq %r20, %rax
-; EGPR-NEXT:    mulq %rdi
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r10
-; EGPR-NEXT:    addq %rcx, %r10
-; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %rcx
-; EGPR-NEXT:    movq %rax, %r13
-; EGPR-NEXT:    addq %r10, %r13
-; EGPR-NEXT:    adcq %r9, %rcx
-; EGPR-NEXT:    setb %r10b
-; EGPR-NEXT:    movq %r20, %rax
-; EGPR-NEXT:    mulq %r18
-; EGPR-NEXT:    movq %rdx, %r16
-; EGPR-NEXT:    movq %rax, %r9
-; EGPR-NEXT:    addq %rcx, %r9
-; EGPR-NEXT:    movzbl %r10b, %eax
-; EGPR-NEXT:    adcq %rax, %r16
-; EGPR-NEXT:    addq %r8, %r29
-; EGPR-NEXT:    adcq %r30, %r13
-; EGPR-NEXT:    movzbl %sil, %eax
-; EGPR-NEXT:    adcq %rax, %r9
-; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq 96(%r11), %rcx
-; EGPR-NEXT:    imulq %rcx, %r18
-; EGPR-NEXT:    movq %rcx, %rax
-; EGPR-NEXT:    mulq %rdi
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    addq %r18, %rdx
-; EGPR-NEXT:    movq 104(%r11), %r30
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    imulq %r30, %rax
-; EGPR-NEXT:    addq %rdx, %rax
-; EGPR-NEXT:    movq %rax, %r10
-; EGPR-NEXT:    movq 112(%r11), %rax
-; EGPR-NEXT:    movq %rax, %rsi
-; EGPR-NEXT:    imulq %r17, %rsi
-; EGPR-NEXT:    mulq %r19
-; EGPR-NEXT:    movq %rax, %rdi
-; EGPR-NEXT:    addq %rsi, %rdx
-; EGPR-NEXT:    movq 120(%r11), %r18
-; EGPR-NEXT:    imulq %r19, %r18
-; EGPR-NEXT:    addq %rdx, %r18
-; EGPR-NEXT:    addq %r8, %rdi
-; EGPR-NEXT:    adcq %r10, %r18
-; EGPR-NEXT:    movq %r19, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %rsi
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %rcx
-; EGPR-NEXT:    movq %rdx, %rcx
-; EGPR-NEXT:    movq %rax, %r10
-; EGPR-NEXT:    addq %r8, %r10
-; EGPR-NEXT:    adcq $0, %rcx
-; EGPR-NEXT:    movq %r19, %rax
-; EGPR-NEXT:    mulq %r30
-; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r11
-; EGPR-NEXT:    addq %r10, %r11
-; EGPR-NEXT:    adcq %rcx, %r8
-; EGPR-NEXT:    setb %cl
-; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r30
-; EGPR-NEXT:    movq %rdx, %r10
-; EGPR-NEXT:    movq %rax, %r17
-; EGPR-NEXT:    addq %r8, %r17
-; EGPR-NEXT:    movzbl %cl, %eax
-; EGPR-NEXT:    adcq %rax, %r10
-; EGPR-NEXT:    addq %rdi, %r17
-; EGPR-NEXT:    adcq %r18, %r10
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NEXT:    imulq %r15, %rdi
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; EGPR-NEXT:    mulq %r8
-; EGPR-NEXT:    movq %rax, %rcx
-; EGPR-NEXT:    addq %rdi, %rdx
-; EGPR-NEXT:    movq %r8, %rax
-; EGPR-NEXT:    imulq %r14, %rax
-; EGPR-NEXT:    addq %rdx, %rax
-; EGPR-NEXT:    movq %rax, %r18
-; EGPR-NEXT:    movq %rbx, %rdi
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload
-; EGPR-NEXT:    imulq %r19, %rdi
-; EGPR-NEXT:    movq %rbx, %rax
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; EGPR-NEXT:    mulq %r8
-; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    addq %rdi, %rdx
-; EGPR-NEXT:    imulq %r8, %r20
-; EGPR-NEXT:    addq %rdx, %r20
-; EGPR-NEXT:    addq %rcx, %r30
-; EGPR-NEXT:    adcq %r18, %r20
-; EGPR-NEXT:    movq %r8, %rax
-; EGPR-NEXT:    movq %r8, %rdi
-; EGPR-NEXT:    mulq %r15
-; EGPR-NEXT:    movq %rdx, %rcx
-; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    movq %r19, %rax
-; EGPR-NEXT:    mulq %r15
-; EGPR-NEXT:    movq %rdx, %rbx
-; EGPR-NEXT:    movq %rax, %r15
-; EGPR-NEXT:    addq %rcx, %r15
-; EGPR-NEXT:    adcq $0, %rbx
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %rdx, %rcx
-; EGPR-NEXT:    movq %rax, %r18
-; EGPR-NEXT:    addq %r15, %r18
-; EGPR-NEXT:    adcq %rbx, %rcx
-; EGPR-NEXT:    setb %dil
-; EGPR-NEXT:    movq %r19, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    addq %rcx, %rax
-; EGPR-NEXT:    movzbl %dil, %ecx
-; EGPR-NEXT:    adcq %rcx, %rdx
-; EGPR-NEXT:    addq %r30, %rax
-; EGPR-NEXT:    adcq %r20, %rdx
-; EGPR-NEXT:    addq %rsi, %r8
-; EGPR-NEXT:    adcq %r11, %r18
-; EGPR-NEXT:    adcq %r17, %rax
-; EGPR-NEXT:    adcq %r10, %rdx
-; EGPR-NEXT:    addq %r29, %r8
-; EGPR-NEXT:    adcq %r13, %r18
-; EGPR-NEXT:    adcq %r9, %rax
-; EGPR-NEXT:    adcq %r16, %rdx
-; EGPR-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq %r24, %r23
-; EGPR-NEXT:    adcq %r22, %r8
-; EGPR-NEXT:    adcq %r27, %r18
-; EGPR-NEXT:    adcq %r31, %rax
-; EGPR-NEXT:    adcq %r26, %rdx
-; EGPR-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq (%rsp), %r8 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq %r12, %rax
-; EGPR-NEXT:    adcq %rbp, %rdx
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, (%rcx)
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, 8(%rcx)
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, 16(%rcx)
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, 24(%rcx)
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, 32(%rcx)
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, 40(%rcx)
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, 48(%rcx)
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT:    movq %rsi, 56(%rcx)
-; EGPR-NEXT:    movq %r25, 64(%rcx)
-; EGPR-NEXT:    movq %r21, 72(%rcx)
-; EGPR-NEXT:    movq %r28, 80(%rcx)
-; EGPR-NEXT:    movq %r23, 88(%rcx)
-; EGPR-NEXT:    movq %r8, 96(%rcx)
-; EGPR-NEXT:    movq %r18, 104(%rcx)
-; EGPR-NEXT:    movq %rax, 112(%rcx)
-; EGPR-NEXT:    movq %rdx, 120(%rcx)
-; EGPR-NEXT:    addq $104, %rsp
+; EGPR-NEXT:    adcq %r27, %r13
+; EGPR-NEXT:    addq %r10, %rbx
+; EGPR-NEXT:    adcq %r9, %r14
+; EGPR-NEXT:    adcq %r16, %r25
+; EGPR-NEXT:    adcq %r11, %r26
+; EGPR-NEXT:    adcq %r18, %r21
+; EGPR-NEXT:    adcq %r17, %r28
+; EGPR-NEXT:    adcq %r19, %r29
+; EGPR-NEXT:    adcq %r22, %r13
+; EGPR-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT:    movq %rax, 16(%r9)
+; EGPR-NEXT:    movq %rcx, 24(%r9)
+; EGPR-NEXT:    movq %rdx, 32(%r9)
+; EGPR-NEXT:    movq %rsi, 40(%r9)
+; EGPR-NEXT:    movq %rdi, 48(%r9)
+; EGPR-NEXT:    movq %r8, 56(%r9)
+; EGPR-NEXT:    movq %rbx, 64(%r9)
+; EGPR-NEXT:    movq %r14, 72(%r9)
+; EGPR-NEXT:    movq %r25, 80(%r9)
+; EGPR-NEXT:    movq %r26, 88(%r9)
+; EGPR-NEXT:    movq %r21, 96(%r9)
+; EGPR-NEXT:    movq %r28, 104(%r9)
+; EGPR-NEXT:    movq %r29, 112(%r9)
+; EGPR-NEXT:    movq %r13, 120(%r9)
+; EGPR-NEXT:    movaps %xmm0, (%r9)
+; EGPR-NEXT:    leaq -40(%rbp), %rsp
 ; EGPR-NEXT:    popq %rbx
 ; EGPR-NEXT:    popq %r12
 ; EGPR-NEXT:    popq %r13
@@ -1036,845 +649,608 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-LABEL: test_1024:
 ; EGPR-NDD:       # %bb.0:
 ; EGPR-NDD-NEXT:    pushq %rbp
+; EGPR-NDD-NEXT:    movq %rsp, %rbp
 ; EGPR-NDD-NEXT:    pushq %r15
 ; EGPR-NDD-NEXT:    pushq %r14
 ; EGPR-NDD-NEXT:    pushq %r13
 ; EGPR-NDD-NEXT:    pushq %r12
 ; EGPR-NDD-NEXT:    pushq %rbx
-; EGPR-NDD-NEXT:    subq $96, %rsp
+; EGPR-NDD-NEXT:    andq $-32, %rsp
+; EGPR-NDD-NEXT:    subq $1216, %rsp # imm = 0x4C0
 ; EGPR-NDD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %rsi, %r15
-; EGPR-NDD-NEXT:    movq %rdi, %r22
-; EGPR-NDD-NEXT:    movq (%rdi), %r17
-; EGPR-NDD-NEXT:    movq 8(%rdi), %r11
-; EGPR-NDD-NEXT:    movq 24(%rdi), %r9
-; EGPR-NDD-NEXT:    movq 16(%rdi), %r10
-; EGPR-NDD-NEXT:    movq 40(%rdi), %rdi
-; EGPR-NDD-NEXT:    movq 32(%r22), %r16
-; EGPR-NDD-NEXT:    movq 56(%r22), %r18
-; EGPR-NDD-NEXT:    movq 48(%r22), %r25
-; EGPR-NDD-NEXT:    movq 24(%rsi), %r14
-; EGPR-NDD-NEXT:    movq 16(%rsi), %r26
-; EGPR-NDD-NEXT:    movq (%rsi), %r24
-; EGPR-NDD-NEXT:    movq 8(%rsi), %r23
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    movq %rdx, %r27
-; EGPR-NDD-NEXT:    movq %rax, %r19
-; EGPR-NDD-NEXT:    movq %r18, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    addq %rax, %r27
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %r27, %rax, %rsi
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    setb %al
-; EGPR-NDD-NEXT:    movzbl %al, %r8d
-; EGPR-NDD-NEXT:    movq %r18, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %rcx, %rax, %r31
-; EGPR-NDD-NEXT:    adcq %rdx, %r8
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    movq %rdx, %r30
-; EGPR-NDD-NEXT:    movq %rax, %r27
-; EGPR-NDD-NEXT:    movq %rdi, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    addq %r30, %rax, %rcx
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r30
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %rax, %rcx
-; EGPR-NDD-NEXT:    adcq %rdx, %r30
-; EGPR-NDD-NEXT:    setb %al
-; EGPR-NDD-NEXT:    movzbl %al, %r20d
-; EGPR-NDD-NEXT:    movq %rdi, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %r30, %rax
-; EGPR-NDD-NEXT:    adcq %r20, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r19, %r20
-; EGPR-NDD-NEXT:    adcq %rdx, %rsi, %r21
-; EGPR-NDD-NEXT:    adcq $0, %r31
-; EGPR-NDD-NEXT:    adcq $0, %r8
-; EGPR-NDD-NEXT:    movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    movq %rdx, %r19
-; EGPR-NDD-NEXT:    movq %rax, %r30
-; EGPR-NDD-NEXT:    movq %rdi, %rax
-; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    addq %rax, %r19
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rsi
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %rax, %r19
-; EGPR-NDD-NEXT:    adcq %rdx, %rsi
-; EGPR-NDD-NEXT:    setb %al
-; EGPR-NDD-NEXT:    movzbl %al, %r28d
-; EGPR-NDD-NEXT:    movq %rdi, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %rsi, %rax
-; EGPR-NDD-NEXT:    adcq %r28, %rdx
-; EGPR-NDD-NEXT:    addq %r20, %r30, %rsi
-; EGPR-NDD-NEXT:    adcq %r21, %r19, %r20
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r31
-; EGPR-NDD-NEXT:    adcq %rdx, %r8
-; EGPR-NDD-NEXT:    setb %al
-; EGPR-NDD-NEXT:    movzbl %al, %r29d
-; EGPR-NDD-NEXT:    movq %r25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    movq %rdx, %r19
-; EGPR-NDD-NEXT:    movq %rax, %r30
-; EGPR-NDD-NEXT:    movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r18, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    addq %rax, %r19
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r21
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %rax, %r19
-; EGPR-NDD-NEXT:    adcq %rdx, %r21
-; EGPR-NDD-NEXT:    setb %al
-; EGPR-NDD-NEXT:    movzbl %al, %r28d
-; EGPR-NDD-NEXT:    movq %r18, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %r21, %rax
-; EGPR-NDD-NEXT:    adcq %r28, %rdx
-; EGPR-NDD-NEXT:    addq %r31, %r30, %r21
-; EGPR-NDD-NEXT:    adcq %r8, %r19, %r28
-; EGPR-NDD-NEXT:    adcq %rax, %r29
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    movq %rdx, %r19
-; EGPR-NDD-NEXT:    movq %rax, %r30
-; EGPR-NDD-NEXT:    movq %r9, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    addq %rax, %r19
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %rax, %r19
-; EGPR-NDD-NEXT:    adcq %rdx, %r8
-; EGPR-NDD-NEXT:    setb %al
-; EGPR-NDD-NEXT:    movzbl %al, %r31d
-; EGPR-NDD-NEXT:    movq %r9, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %rax, %r8
-; EGPR-NDD-NEXT:    adcq %r31, %rdx, %rbx
-; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    movq %rdx, %r31
+; EGPR-NDD-NEXT:    movq (%rdi), %rax
 ; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r11, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    addq %rax, %r31
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r12
-; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %r31, %rax
+; EGPR-NDD-NEXT:    movq 8(%rdi), %rax
 ; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rdx, %r12
-; EGPR-NDD-NEXT:    setb %r31b
-; EGPR-NDD-NEXT:    movq %r11, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %r12, %rax
-; EGPR-NDD-NEXT:    movzbl %r31b, %r31d
-; EGPR-NDD-NEXT:    adcq %r31, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r30, %r12
-; EGPR-NDD-NEXT:    adcq %rdx, %r19
-; EGPR-NDD-NEXT:    adcq $0, %r8
-; EGPR-NDD-NEXT:    adcq $0, %rbx
-; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    movq %rdx, %r30
-; EGPR-NDD-NEXT:    movq %rax, %r31
-; EGPR-NDD-NEXT:    movq %r11, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    addq %rax, %r30
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r13
-; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %rax, %r30
-; EGPR-NDD-NEXT:    adcq %rdx, %r13
-; EGPR-NDD-NEXT:    setb %bpl
-; EGPR-NDD-NEXT:    movq %r11, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %r13, %rax
-; EGPR-NDD-NEXT:    movzbl %bpl, %r13d
-; EGPR-NDD-NEXT:    adcq %r13, %rdx
-; EGPR-NDD-NEXT:    addq %r12, %r31
-; EGPR-NDD-NEXT:    movq %r31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %r30, %r19
-; EGPR-NDD-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r8
-; EGPR-NDD-NEXT:    adcq %rdx, %rbx
-; EGPR-NDD-NEXT:    setb %r19b
-; EGPR-NDD-NEXT:    movq %r10, %r16
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    movq %rdx, %r30
-; EGPR-NDD-NEXT:    movq %rax, %r31
-; EGPR-NDD-NEXT:    movq %r9, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    addq %rax, %r30
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r12
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %rax, %r30
-; EGPR-NDD-NEXT:    adcq %rdx, %r12
-; EGPR-NDD-NEXT:    setb %bpl
-; EGPR-NDD-NEXT:    movq %r9, %rax
-; EGPR-NDD-NEXT:    mulq %r14
-; EGPR-NDD-NEXT:    addq %r12, %rax
-; EGPR-NDD-NEXT:    movzbl %bpl, %r12d
-; EGPR-NDD-NEXT:    adcq %r12, %rdx
-; EGPR-NDD-NEXT:    addq %r31, %r8
-; EGPR-NDD-NEXT:    adcq %r30, %rbx
-; EGPR-NDD-NEXT:    movzbl %r19b, %r19d
-; EGPR-NDD-NEXT:    adcq %r19, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %r8, %r27, %r12
-; EGPR-NDD-NEXT:    movq 32(%r15), %r30
-; EGPR-NDD-NEXT:    adcq %rbx, %rcx, %r13
-; EGPR-NDD-NEXT:    adcq %rax, %rsi, %rbp
-; EGPR-NDD-NEXT:    adcq %rdx, %r20, %rbx
-; EGPR-NDD-NEXT:    adcq $0, %r21
-; EGPR-NDD-NEXT:    movq %r21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq $0, %r28
-; EGPR-NDD-NEXT:    adcq $0, %r29
-; EGPR-NDD-NEXT:    adcq $0, %rdi
+; EGPR-NDD-NEXT:    movq 16(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 24(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 96(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 104(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 112(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 120(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 64(%rdi), %r15
+; EGPR-NDD-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 72(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 80(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 88(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 32(%rdi), %rax
+; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 40(%rdi), %rdx
+; EGPR-NDD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 48(%rdi), %rcx
+; EGPR-NDD-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 56(%rdi), %r8
+; EGPR-NDD-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 96(%rsi), %rdi
 ; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    movq %rdx, %r27
-; EGPR-NDD-NEXT:    movq %rax, %r31
-; EGPR-NDD-NEXT:    movq %r9, %r19
-; EGPR-NDD-NEXT:    movq %r9, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    addq %rax, %r27
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT:    movq 40(%r15), %r18
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %r27, %rax, %r21
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    setb %r8b
-; EGPR-NDD-NEXT:    movq %r9, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %rcx, %rax, %rdi
-; EGPR-NDD-NEXT:    movzbl %r8b, %eax
-; EGPR-NDD-NEXT:    adcq %rax, %rdx, %rsi
-; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    movq %rdx, %r20
-; EGPR-NDD-NEXT:    movq %rax, %r27
-; EGPR-NDD-NEXT:    movq %r11, %r10
-; EGPR-NDD-NEXT:    movq %r11, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    addq %r20, %rax, %r8
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r20
-; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %r8, %rax, %r25
-; EGPR-NDD-NEXT:    adcq %rdx, %r20
-; EGPR-NDD-NEXT:    setb %cl
-; EGPR-NDD-NEXT:    movq %r11, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %r20, %rax
-; EGPR-NDD-NEXT:    movzbl %cl, %ecx
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    addq %rax, %r31
-; EGPR-NDD-NEXT:    adcq %rcx, %r21, %r8
-; EGPR-NDD-NEXT:    adcq $0, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %rsi, %r9
-; EGPR-NDD-NEXT:    movq 48(%r15), %r11
-; EGPR-NDD-NEXT:    movq %r17, %rsi
-; EGPR-NDD-NEXT:    movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    movq %rdx, %r20
-; EGPR-NDD-NEXT:    movq %rax, %r21
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT:    movq 56(%r15), %r17
-; EGPR-NDD-NEXT:    movq %rsi, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    setb %sil
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %rcx, %rax
-; EGPR-NDD-NEXT:    movzbl %sil, %ecx
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    addq %r21, %r31
-; EGPR-NDD-NEXT:    adcq %r8, %r20, %r10
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rcx
-; EGPR-NDD-NEXT:    addq %rax, %rdi
-; EGPR-NDD-NEXT:    adcq %rcx, %r9, %r8
-; EGPR-NDD-NEXT:    setb %sil
+; EGPR-NDD-NEXT:    movq 104(%rsi), %rdi
+; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 112(%rsi), %rdi
+; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 120(%rsi), %rdi
+; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq (%rsi), %rdi
+; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 8(%rsi), %rdi
+; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 16(%rsi), %rdi
+; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 24(%rsi), %rdi
+; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 32(%rsi), %r13
+; EGPR-NDD-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 40(%rsi), %r12
+; EGPR-NDD-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 48(%rsi), %rbx
+; EGPR-NDD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 56(%rsi), %r14
+; EGPR-NDD-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 64(%rsi), %r9
+; EGPR-NDD-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 72(%rsi), %r16
 ; EGPR-NDD-NEXT:    movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    movq %rdx, %r20
-; EGPR-NDD-NEXT:    movq %rax, %r21
-; EGPR-NDD-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r19, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r9
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq %rdx, %r9
-; EGPR-NDD-NEXT:    setb %cl
-; EGPR-NDD-NEXT:    movq %r19, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %r9, %rax
-; EGPR-NDD-NEXT:    movzbl %cl, %ecx
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    addq %r21, %rdi
-; EGPR-NDD-NEXT:    adcq %r20, %r8
-; EGPR-NDD-NEXT:    movzbl %sil, %edx
-; EGPR-NDD-NEXT:    adcq %rdx, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rcx
-; EGPR-NDD-NEXT:    addq %r12, %r27
-; EGPR-NDD-NEXT:    movq %r27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %r13, %r25, %r19
-; EGPR-NDD-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rbp, %r31
-; EGPR-NDD-NEXT:    movq %r31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rbx, %r10
+; EGPR-NDD-NEXT:    movq 80(%rsi), %r10
 ; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq $0, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %r8
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rcx
-; EGPR-NDD-NEXT:    addq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %r8, %r28
-; EGPR-NDD-NEXT:    adcq %rax, %r29
-; EGPR-NDD-NEXT:    adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    setb %r8b
+; EGPR-NDD-NEXT:    movq 88(%rsi), %r11
+; EGPR-NDD-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    subq $8, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %rax, %rsi
+; EGPR-NDD-NEXT:    pushq %r11
+; EGPR-NDD-NEXT:    pushq %r10
+; EGPR-NDD-NEXT:    pushq %r16
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r15, %rsi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r13, %r9
+; EGPR-NDD-NEXT:    pushq %r14
+; EGPR-NDD-NEXT:    pushq %rbx
+; EGPR-NDD-NEXT:    pushq %r12
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r14, %r9
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %r13, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    movq %rdx, %r27
-; EGPR-NDD-NEXT:    movq %rax, %r20
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    addq %rax, %r27
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rsi
-; EGPR-NDD-NEXT:    movq %r13, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %r27, %rax, %rdi
-; EGPR-NDD-NEXT:    adcq %rdx, %rsi
-; EGPR-NDD-NEXT:    setb %r9b
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    movq %r10, %r16
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %rax, %rsi
-; EGPR-NDD-NEXT:    movzbl %r9b, %eax
-; EGPR-NDD-NEXT:    adcq %rax, %rdx, %r9
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    movq %rdx, %r21
-; EGPR-NDD-NEXT:    movq %rax, %r27
+; EGPR-NDD-NEXT:    movq %r13, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r14, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %rbx, %rsi
 ; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %r12, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    addq %rax, %r21
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r10
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %r21, %rax, %rbx
-; EGPR-NDD-NEXT:    adcq %rdx, %r10
-; EGPR-NDD-NEXT:    setb %r31b
-; EGPR-NDD-NEXT:    movq %r12, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %r10, %rax
-; EGPR-NDD-NEXT:    movzbl %r31b, %r10d
-; EGPR-NDD-NEXT:    adcq %r10, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r20, %r10
-; EGPR-NDD-NEXT:    adcq %rdx, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %rsi
-; EGPR-NDD-NEXT:    adcq $0, %r9
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    movq %rdx, %r20
-; EGPR-NDD-NEXT:    movq %rax, %r21
-; EGPR-NDD-NEXT:    movq %r12, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r31
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq %rdx, %r31
-; EGPR-NDD-NEXT:    setb %bpl
-; EGPR-NDD-NEXT:    movq %r12, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %r31, %rax
-; EGPR-NDD-NEXT:    movzbl %bpl, %r31d
-; EGPR-NDD-NEXT:    adcq %r31, %rdx
-; EGPR-NDD-NEXT:    addq %r21, %r10
-; EGPR-NDD-NEXT:    adcq %r20, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %rsi
-; EGPR-NDD-NEXT:    adcq %rdx, %r9
-; EGPR-NDD-NEXT:    setb %r31b
-; EGPR-NDD-NEXT:    movq %r13, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    movq %rdx, %r20
-; EGPR-NDD-NEXT:    movq %rax, %r21
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r12
-; EGPR-NDD-NEXT:    movq %r13, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %rax, %r20
-; EGPR-NDD-NEXT:    adcq %rdx, %r12
-; EGPR-NDD-NEXT:    setb %bpl
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %r12, %rax
-; EGPR-NDD-NEXT:    movzbl %bpl, %r12d
-; EGPR-NDD-NEXT:    adcq %r12, %rdx
-; EGPR-NDD-NEXT:    addq %r21, %rsi
-; EGPR-NDD-NEXT:    adcq %r20, %r9
-; EGPR-NDD-NEXT:    movzbl %r31b, %r31d
-; EGPR-NDD-NEXT:    adcq %r31, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %r27, %r19
-; EGPR-NDD-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rbx, %r28
-; EGPR-NDD-NEXT:    movq %r28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %r29, %r10
-; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rdi, %rcx
-; EGPR-NDD-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movzbl %r8b, %ecx
+; EGPR-NDD-NEXT:    movq %r12, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r14, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %rbx, %rsi
+; EGPR-NDD-NEXT:    movq %r12, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r12, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r13, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r12, %r9
+; EGPR-NDD-NEXT:    movq %r12, %r13
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r14, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    movq %rbx, %r14
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r13, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r15, %rsi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r12, %rcx
+; EGPR-NDD-NEXT:    movq %rbx, %r8
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r12, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %rbx
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r13, %rsi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r12, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r14
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r13, %rsi
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r15, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq %r13
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r15, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r13
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r14, %rsi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r14, %rsi
+; EGPR-NDD-NEXT:    movq %rbx, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r14, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq %rbx
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r15, %rsi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r12, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %r14, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %rbx
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r15, %rsi
+; EGPR-NDD-NEXT:    movq %r12, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %rbx, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq %rbx, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq %r13, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %rbx, %r9
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq %r14
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq %r13, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq %r13, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq %r15
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %r12, %rsi
+; EGPR-NDD-NEXT:    movq %r13, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %rbx, %rsi
+; EGPR-NDD-NEXT:    movq %r14, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $24, %rsp
+; EGPR-NDD-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT:    movq %rbx, %rsi
+; EGPR-NDD-NEXT:    movq %r14, %rdx
+; EGPR-NDD-NEXT:    xorl %ecx, %ecx
+; EGPR-NDD-NEXT:    xorl %r8d, %r8d
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq $0
+; EGPR-NDD-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    callq __multi5 at PLT
+; EGPR-NDD-NEXT:    addq $32, %rsp
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rax, %rdi
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx, %r8
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT:    addq %rdx, %rax
 ; EGPR-NDD-NEXT:    adcq %rsi, %rcx
-; EGPR-NDD-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; EGPR-NDD-NEXT:    setb %dl
+; EGPR-NDD-NEXT:    movzbl %dl, %edx
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rax, %rsi
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx, %r9
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r11
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r16
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rax, %rbx
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r17
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r18
+; EGPR-NDD-NEXT:    addq %r17, %r11
+; EGPR-NDD-NEXT:    adcq %r18, %r16
+; EGPR-NDD-NEXT:    setb %r17b
+; EGPR-NDD-NEXT:    movzbl %r17b, %r17d
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r11
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r16
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r17
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r18
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r11
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r16
+; EGPR-NDD-NEXT:    adcq %r17, %rdi
+; EGPR-NDD-NEXT:    adcq %r18, %r8
+; EGPR-NDD-NEXT:    adcq $0, %rsi, %r17
 ; EGPR-NDD-NEXT:    adcq $0, %r9
-; EGPR-NDD-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq 64(%r22), %r20
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    mulq %r20
-; EGPR-NDD-NEXT:    movq %rdx, %r27
-; EGPR-NDD-NEXT:    movq %rax, %r28
-; EGPR-NDD-NEXT:    movq %r14, %rax
-; EGPR-NDD-NEXT:    mulq %r20
-; EGPR-NDD-NEXT:    addq %rax, %r27
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT:    movq 72(%r22), %r21
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    mulq %r21
-; EGPR-NDD-NEXT:    addq %rax, %r27
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    setb %sil
-; EGPR-NDD-NEXT:    movq %r14, %rax
-; EGPR-NDD-NEXT:    mulq %r21
-; EGPR-NDD-NEXT:    addq %rax, %rcx
-; EGPR-NDD-NEXT:    movzbl %sil, %eax
-; EGPR-NDD-NEXT:    adcq %rax, %rdx, %rsi
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %r20
-; EGPR-NDD-NEXT:    movq %rdx, %r29
-; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    mulq %r20
-; EGPR-NDD-NEXT:    addq %rax, %r29
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %r21
-; EGPR-NDD-NEXT:    addq %r29, %rax
-; EGPR-NDD-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rdx, %rdi
-; EGPR-NDD-NEXT:    setb %r8b
-; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    mulq %r21
-; EGPR-NDD-NEXT:    addq %rdi, %rax
-; EGPR-NDD-NEXT:    movzbl %r8b, %edi
-; EGPR-NDD-NEXT:    adcq %rdi, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r28, %rdi
-; EGPR-NDD-NEXT:    adcq %rdx, %r27
-; EGPR-NDD-NEXT:    adcq $0, %rcx
-; EGPR-NDD-NEXT:    adcq $0, %rsi
-; EGPR-NDD-NEXT:    movq 80(%r22), %r8
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %r8
-; EGPR-NDD-NEXT:    movq %rdx, %r28
-; EGPR-NDD-NEXT:    movq %rax, %r29
-; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    mulq %r8
-; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r9
-; EGPR-NDD-NEXT:    movq 88(%r22), %rbx
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %rbx
-; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq %rdx, %r9
-; EGPR-NDD-NEXT:    setb %r10b
-; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    mulq %rbx
-; EGPR-NDD-NEXT:    addq %r9, %rax
-; EGPR-NDD-NEXT:    movzbl %r10b, %r9d
-; EGPR-NDD-NEXT:    adcq %r9, %rdx
-; EGPR-NDD-NEXT:    addq %r29, %rdi
-; EGPR-NDD-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %r27, %r28, %rbp
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %rcx
-; EGPR-NDD-NEXT:    adcq %rdx, %rsi
-; EGPR-NDD-NEXT:    setb %dil
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    mulq %r8
-; EGPR-NDD-NEXT:    movq %rdx, %r28
-; EGPR-NDD-NEXT:    movq %rax, %r29
-; EGPR-NDD-NEXT:    movq %r14, %rax
-; EGPR-NDD-NEXT:    mulq %r8
-; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r9
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    mulq %rbx
-; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq %rdx, %r9
-; EGPR-NDD-NEXT:    setb %r10b
-; EGPR-NDD-NEXT:    movq %r14, %rax
-; EGPR-NDD-NEXT:    mulq %rbx
-; EGPR-NDD-NEXT:    addq %r9, %rax
-; EGPR-NDD-NEXT:    movzbl %r10b, %r9d
-; EGPR-NDD-NEXT:    adcq %r9, %rdx
-; EGPR-NDD-NEXT:    addq %rcx, %r29, %r27
-; EGPR-NDD-NEXT:    adcq %rsi, %r28, %r12
-; EGPR-NDD-NEXT:    movzbl %dil, %r19d
-; EGPR-NDD-NEXT:    adcq %rax, %r19
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r29
-; EGPR-NDD-NEXT:    imulq %r30, %rbx
-; EGPR-NDD-NEXT:    movq %r30, %rax
-; EGPR-NDD-NEXT:    mulq %r8
-; EGPR-NDD-NEXT:    movq %rax, %r28
-; EGPR-NDD-NEXT:    addq %rbx, %rdx
-; EGPR-NDD-NEXT:    imulq %r18, %r8
-; EGPR-NDD-NEXT:    addq %rdx, %r8
-; EGPR-NDD-NEXT:    imulq %r21, %r11, %rcx
-; EGPR-NDD-NEXT:    movq %r11, %rax
-; EGPR-NDD-NEXT:    mulq %r20
-; EGPR-NDD-NEXT:    addq %rdx, %rcx
-; EGPR-NDD-NEXT:    imulq %r20, %r17, %r16
-; EGPR-NDD-NEXT:    addq %r16, %rcx
-; EGPR-NDD-NEXT:    addq %r28, %rax, %rsi
-; EGPR-NDD-NEXT:    adcq %rcx, %r8
-; EGPR-NDD-NEXT:    movq %r20, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    movq %rdx, %r28
-; EGPR-NDD-NEXT:    movq %rax, %r31
-; EGPR-NDD-NEXT:    movq %r21, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    addq %r28, %rax, %rcx
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT:    movq %r20, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %rax, %rcx
-; EGPR-NDD-NEXT:    adcq %rdx, %rdi
-; EGPR-NDD-NEXT:    setb %r9b
-; EGPR-NDD-NEXT:    movq %r21, %rax
-; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %rdi, %rax
-; EGPR-NDD-NEXT:    movzbl %r9b, %edi
-; EGPR-NDD-NEXT:    adcq %rdi, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %rsi
-; EGPR-NDD-NEXT:    adcq %rdx, %r8
-; EGPR-NDD-NEXT:    movq 112(%r22), %rdi
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %rdi
-; EGPR-NDD-NEXT:    movq %rax, %r30
-; EGPR-NDD-NEXT:    imulq %r23, %rdi
-; EGPR-NDD-NEXT:    addq %rdi, %rdx
-; EGPR-NDD-NEXT:    imulq 120(%r22), %r24, %rax
-; EGPR-NDD-NEXT:    leaq (%rdx,%rax), %r9
-; EGPR-NDD-NEXT:    movq 96(%r22), %r20
-; EGPR-NDD-NEXT:    movq 104(%r22), %rdi
-; EGPR-NDD-NEXT:    imulq %rdi, %r26, %r10
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    mulq %r20
-; EGPR-NDD-NEXT:    addq %r10, %rdx
-; EGPR-NDD-NEXT:    imulq %r20, %r14, %r25
-; EGPR-NDD-NEXT:    addq %r25, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r30
-; EGPR-NDD-NEXT:    adcq %rdx, %r9
-; EGPR-NDD-NEXT:    movq %r20, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    movq %rdx, %r25
-; EGPR-NDD-NEXT:    movq %rax, %r26
-; EGPR-NDD-NEXT:    movq %rdi, %rax
-; EGPR-NDD-NEXT:    mulq %r24
-; EGPR-NDD-NEXT:    addq %rax, %r25
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r10
-; EGPR-NDD-NEXT:    movq %r20, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %rax, %r25
-; EGPR-NDD-NEXT:    adcq %rdx, %r10
-; EGPR-NDD-NEXT:    setb %r11b
-; EGPR-NDD-NEXT:    movq %rdi, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %r10, %rax
-; EGPR-NDD-NEXT:    movzbl %r11b, %edi
-; EGPR-NDD-NEXT:    adcq %rdi, %rdx
-; EGPR-NDD-NEXT:    addq %r30, %rax
-; EGPR-NDD-NEXT:    adcq %r9, %rdx
-; EGPR-NDD-NEXT:    addq %r31, %r26
-; EGPR-NDD-NEXT:    adcq %r25, %rcx
-; EGPR-NDD-NEXT:    adcq %rsi, %rax
-; EGPR-NDD-NEXT:    adcq %r8, %rdx
-; EGPR-NDD-NEXT:    addq %r26, %r27, %rbx
-; EGPR-NDD-NEXT:    adcq %rcx, %r12
-; EGPR-NDD-NEXT:    adcq %rax, %r19, %r13
-; EGPR-NDD-NEXT:    adcq %rdx, %r29, %r28
-; EGPR-NDD-NEXT:    movq 80(%r15), %r24
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload
-; EGPR-NDD-NEXT:    mulq %r16
-; EGPR-NDD-NEXT:    movq %rax, %r30
-; EGPR-NDD-NEXT:    movq %rdx, %rdi
-; EGPR-NDD-NEXT:    movq 88(%r15), %r22
-; EGPR-NDD-NEXT:    movq %r22, %rax
-; EGPR-NDD-NEXT:    mulq %r16
-; EGPR-NDD-NEXT:    addq %rax, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %rax, %rdi
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    setb %sil
-; EGPR-NDD-NEXT:    movq %r22, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %rax, %rcx
-; EGPR-NDD-NEXT:    movzbl %sil, %eax
-; EGPR-NDD-NEXT:    adcq %rax, %rdx, %rsi
-; EGPR-NDD-NEXT:    movq 64(%r15), %r26
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    mulq %r16
-; EGPR-NDD-NEXT:    movq %rax, %r21
-; EGPR-NDD-NEXT:    movq %rdx, %r31
-; EGPR-NDD-NEXT:    movq 72(%r15), %r25
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r16
-; EGPR-NDD-NEXT:    addq %rax, %r31
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %r31, %rax, %r29
-; EGPR-NDD-NEXT:    adcq %rdx, %r8
-; EGPR-NDD-NEXT:    setb %r9b
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r23
-; EGPR-NDD-NEXT:    addq %r8, %rax
-; EGPR-NDD-NEXT:    movzbl %r9b, %r8d
-; EGPR-NDD-NEXT:    adcq %r8, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r30, %r20
-; EGPR-NDD-NEXT:    adcq %rdx, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %rcx
-; EGPR-NDD-NEXT:    adcq $0, %rsi
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; EGPR-NDD-NEXT:    mulq %r10
-; EGPR-NDD-NEXT:    movq %rdx, %r30
-; EGPR-NDD-NEXT:    movq %rax, %r31
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r10
-; EGPR-NDD-NEXT:    addq %rax, %r30
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %r30, %rax, %r27
-; EGPR-NDD-NEXT:    adcq %rdx, %r8
-; EGPR-NDD-NEXT:    setb %r9b
-; EGPR-NDD-NEXT:    movq %r25, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %r8, %rax
-; EGPR-NDD-NEXT:    movzbl %r9b, %r8d
-; EGPR-NDD-NEXT:    adcq %r8, %rdx
-; EGPR-NDD-NEXT:    addq %r31, %r20
-; EGPR-NDD-NEXT:    adcq %rdi, %r27
-; EGPR-NDD-NEXT:    adcq $0, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %rcx
-; EGPR-NDD-NEXT:    adcq %rdx, %rsi
-; EGPR-NDD-NEXT:    setb %dil
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %r10
-; EGPR-NDD-NEXT:    movq %rdx, %r30
-; EGPR-NDD-NEXT:    movq %rax, %r31
-; EGPR-NDD-NEXT:    movq %r22, %rax
-; EGPR-NDD-NEXT:    mulq %r10
-; EGPR-NDD-NEXT:    addq %rax, %r30
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %r30, %rax, %r19
-; EGPR-NDD-NEXT:    adcq %rdx, %r8
+; EGPR-NDD-NEXT:    adcq $0, %rdx, %r18
+; EGPR-NDD-NEXT:    adcq $0, %r10
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r22
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rdx, %r23
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi, %r24
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT:    addq %r19, %rdx
+; EGPR-NDD-NEXT:    adcq %r22, %rsi
+; EGPR-NDD-NEXT:    setb %r19b
+; EGPR-NDD-NEXT:    movzbl %r19b, %r19d
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %rdx, %r22
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi, %r25
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r26
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r11, %rdx
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r16, %rsi
+; EGPR-NDD-NEXT:    adcq %r23, %rdi
+; EGPR-NDD-NEXT:    adcq %r24, %r8
+; EGPR-NDD-NEXT:    adcq $0, %r22
+; EGPR-NDD-NEXT:    adcq $0, %r25
+; EGPR-NDD-NEXT:    adcq $0, %r19
+; EGPR-NDD-NEXT:    adcq $0, %r26
+; EGPR-NDD-NEXT:    addq %r22, %r17
+; EGPR-NDD-NEXT:    adcq %r9, %r25
+; EGPR-NDD-NEXT:    adcq %r19, %r18, %r11
+; EGPR-NDD-NEXT:    adcq %r26, %r10, %r16
 ; EGPR-NDD-NEXT:    setb %r9b
-; EGPR-NDD-NEXT:    movq %r22, %rax
-; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    addq %r8, %rax
-; EGPR-NDD-NEXT:    movzbl %r9b, %r8d
-; EGPR-NDD-NEXT:    adcq %r8, %rdx
-; EGPR-NDD-NEXT:    addq %rcx, %r31
-; EGPR-NDD-NEXT:    adcq %rsi, %r19
-; EGPR-NDD-NEXT:    movzbl %dil, %ecx
-; EGPR-NDD-NEXT:    adcq %rax, %rcx
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT:    movq 96(%r15), %r30
-; EGPR-NDD-NEXT:    imulq %r11, %r30, %rsi
-; EGPR-NDD-NEXT:    movq %r30, %rax
-; EGPR-NDD-NEXT:    mulq %r10
-; EGPR-NDD-NEXT:    movq %rax, %r18
-; EGPR-NDD-NEXT:    addq %rsi, %rdx
-; EGPR-NDD-NEXT:    movq 104(%r15), %r8
-; EGPR-NDD-NEXT:    imulq %r10, %r8, %rax
-; EGPR-NDD-NEXT:    leaq (%rdx,%rax), %rsi
-; EGPR-NDD-NEXT:    movq 112(%r15), %rax
-; EGPR-NDD-NEXT:    imulq %r23, %rax, %r9
-; EGPR-NDD-NEXT:    mulq %r16
-; EGPR-NDD-NEXT:    addq %r9, %rdx
-; EGPR-NDD-NEXT:    imulq 120(%r15), %r16, %r9
-; EGPR-NDD-NEXT:    addq %r9, %rdx
-; EGPR-NDD-NEXT:    addq %r18, %rax, %r10
-; EGPR-NDD-NEXT:    adcq %rsi, %rdx, %r9
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    movq %r16, %r18
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    movq %rdx, %r17
-; EGPR-NDD-NEXT:    movq %rax, %rsi
-; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    addq %r17, %rax, %r11
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r16
-; EGPR-NDD-NEXT:    movq %r18, %rax
-; EGPR-NDD-NEXT:    mulq %r8
-; EGPR-NDD-NEXT:    addq %rax, %r11
-; EGPR-NDD-NEXT:    adcq %rdx, %r16
-; EGPR-NDD-NEXT:    setb %r17b
-; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    mulq %r8
-; EGPR-NDD-NEXT:    addq %r16, %rax
-; EGPR-NDD-NEXT:    movzbl %r17b, %r8d
-; EGPR-NDD-NEXT:    adcq %r8, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %r10
-; EGPR-NDD-NEXT:    adcq %r9, %rdx, %r17
-; EGPR-NDD-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %r26, %r8 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    movq %r26, %rax
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload
-; EGPR-NDD-NEXT:    mulq %r16
-; EGPR-NDD-NEXT:    movq %rax, %r9
-; EGPR-NDD-NEXT:    addq %r8, %rdx
-; EGPR-NDD-NEXT:    imulq %r16, %r25, %rax
-; EGPR-NDD-NEXT:    leaq (%rdx,%rax), %r8
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload
-; EGPR-NDD-NEXT:    imulq %r23, %r24, %r16
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Reload
-; EGPR-NDD-NEXT:    mulq %r30
-; EGPR-NDD-NEXT:    addq %r16, %rdx
-; EGPR-NDD-NEXT:    imulq %r30, %r22
-; EGPR-NDD-NEXT:    addq %r22, %rdx
-; EGPR-NDD-NEXT:    addq %r9, %rax, %r16
-; EGPR-NDD-NEXT:    adcq %r8, %rdx, %r18
-; EGPR-NDD-NEXT:    movq %r30, %rax
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    movq %rdx, %r8
-; EGPR-NDD-NEXT:    movq %rax, %r9
-; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    movq %r23, %r24
-; EGPR-NDD-NEXT:    mulq %r26
-; EGPR-NDD-NEXT:    addq %rax, %r8
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r22
-; EGPR-NDD-NEXT:    movq %r30, %rax
-; EGPR-NDD-NEXT:    mulq %r25
-; EGPR-NDD-NEXT:    addq %rax, %r8
-; EGPR-NDD-NEXT:    adcq %rdx, %r22
-; EGPR-NDD-NEXT:    setb %r23b
-; EGPR-NDD-NEXT:    movq %r24, %rax
-; EGPR-NDD-NEXT:    mulq %r25
-; EGPR-NDD-NEXT:    addq %r22, %rax
-; EGPR-NDD-NEXT:    movzbl %r23b, %r22d
-; EGPR-NDD-NEXT:    adcq %r22, %rdx
-; EGPR-NDD-NEXT:    addq %r16, %rax
-; EGPR-NDD-NEXT:    adcq %r18, %rdx
-; EGPR-NDD-NEXT:    addq %r9, %rsi
-; EGPR-NDD-NEXT:    adcq %r11, %r8
-; EGPR-NDD-NEXT:    adcq %r10, %rax
-; EGPR-NDD-NEXT:    adcq %r17, %rdx
-; EGPR-NDD-NEXT:    addq %r31, %rsi
-; EGPR-NDD-NEXT:    adcq %r19, %r8
-; EGPR-NDD-NEXT:    adcq %rcx, %rax
-; EGPR-NDD-NEXT:    adcq %rdi, %rdx
-; EGPR-NDD-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %rbp, %r27
-; EGPR-NDD-NEXT:    adcq %rbx, %rsi
-; EGPR-NDD-NEXT:    adcq %r12, %r8
-; EGPR-NDD-NEXT:    adcq %r13, %rax
-; EGPR-NDD-NEXT:    adcq %r28, %rdx
-; EGPR-NDD-NEXT:    addq %r21, {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %r29, {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %r20, {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %r27, {{[-0-9]+}}(%r{{[sb]}}p), %r27 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %rsi, (%rsp), %rsi # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, (%rcx)
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, 8(%rcx)
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, 16(%rcx)
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, 24(%rcx)
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, 32(%rcx)
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, 40(%rcx)
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, 48(%rcx)
-; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT:    movq %rdi, 56(%rcx)
-; EGPR-NDD-NEXT:    movq %r21, 64(%rcx)
-; EGPR-NDD-NEXT:    movq %r29, 72(%rcx)
-; EGPR-NDD-NEXT:    movq %r20, 80(%rcx)
-; EGPR-NDD-NEXT:    movq %r27, 88(%rcx)
-; EGPR-NDD-NEXT:    movq %rsi, 96(%rcx)
-; EGPR-NDD-NEXT:    movq %r8, 104(%rcx)
-; EGPR-NDD-NEXT:    movq %rax, 112(%rcx)
-; EGPR-NDD-NEXT:    movq %rdx, 120(%rcx)
-; EGPR-NDD-NEXT:    addq $96, %rsp
+; EGPR-NDD-NEXT:    movzbl %r9b, %r18d
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r9
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r22
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r9, %r23
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r10, %r24
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r9
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT:    addq %r19, %r9
+; EGPR-NDD-NEXT:    adcq %r22, %r10
+; EGPR-NDD-NEXT:    setb %r19b
+; EGPR-NDD-NEXT:    movzbl %r19b, %r19d
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r9, %r22
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r10, %r26
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r27
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r17, %r9
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r25, %r10
+; EGPR-NDD-NEXT:    adcq %r23, %r11
+; EGPR-NDD-NEXT:    adcq %r24, %r16
+; EGPR-NDD-NEXT:    adcq %r18, %r22, %r17
+; EGPR-NDD-NEXT:    adcq $0, %r26, %r18
+; EGPR-NDD-NEXT:    adcq $0, %r19
+; EGPR-NDD-NEXT:    adcq $0, %r27, %r22
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r23
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r23
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r24
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r25
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r26
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r23
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r24
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r27
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r30
+; EGPR-NDD-NEXT:    addq %r27, %r25
+; EGPR-NDD-NEXT:    adcq %r30, %r26
+; EGPR-NDD-NEXT:    setb %r27b
+; EGPR-NDD-NEXT:    movzbl %r27b, %r27d
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r25
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r26
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r27
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r30
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT:    addq %r21, %r25
+; EGPR-NDD-NEXT:    adcq %r28, %r26
+; EGPR-NDD-NEXT:    adcq %r20, %r27
+; EGPR-NDD-NEXT:    adcq %r31, %r30
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r29
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT:    addq %r29, %r21
+; EGPR-NDD-NEXT:    adcq %r14, %r28
+; EGPR-NDD-NEXT:    setb %r29b
+; EGPR-NDD-NEXT:    movzbl %r29b, %r29d
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r29
+; EGPR-NDD-NEXT:    adcq $0, {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r12
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r13
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT:    addq %r15, %r21
+; EGPR-NDD-NEXT:    adcq %r12, %r28
+; EGPR-NDD-NEXT:    adcq %r13, %r29
+; EGPR-NDD-NEXT:    adcq %r14, %rax
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT:    addq {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT:    adcq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT:    adcq %r31, %r23
+; EGPR-NDD-NEXT:    adcq %r20, %r24
+; EGPR-NDD-NEXT:    adcq %r21, %r25
+; EGPR-NDD-NEXT:    adcq %r28, %r26
+; EGPR-NDD-NEXT:    adcq %r29, %r27
+; EGPR-NDD-NEXT:    adcq %r30, %rax
+; EGPR-NDD-NEXT:    addq %r14, %r9
+; EGPR-NDD-NEXT:    adcq %r15, %r10
+; EGPR-NDD-NEXT:    adcq %r23, %r11
+; EGPR-NDD-NEXT:    adcq %r24, %r16
+; EGPR-NDD-NEXT:    adcq %r25, %r17
+; EGPR-NDD-NEXT:    adcq %r26, %r18
+; EGPR-NDD-NEXT:    adcq %r27, %r19
+; EGPR-NDD-NEXT:    adcq %r22, %rax
+; EGPR-NDD-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r22 # 8-byte Reload
+; EGPR-NDD-NEXT:    movq %rbx, 16(%r22)
+; EGPR-NDD-NEXT:    movq %rcx, 24(%r22)
+; EGPR-NDD-NEXT:    movq %rdx, 32(%r22)
+; EGPR-NDD-NEXT:    movq %rsi, 40(%r22)
+; EGPR-NDD-NEXT:    movq %rdi, 48(%r22)
+; EGPR-NDD-NEXT:    movq %r8, 56(%r22)
+; EGPR-NDD-NEXT:    movq %r9, 64(%r22)
+; EGPR-NDD-NEXT:    movq %r10, 72(%r22)
+; EGPR-NDD-NEXT:    movq %r11, 80(%r22)
+; EGPR-NDD-NEXT:    movq %r16, 88(%r22)
+; EGPR-NDD-NEXT:    movq %r17, 96(%r22)
+; EGPR-NDD-NEXT:    movq %r18, 104(%r22)
+; EGPR-NDD-NEXT:    movq %r19, 112(%r22)
+; EGPR-NDD-NEXT:    movq %rax, 120(%r22)
+; EGPR-NDD-NEXT:    movaps %xmm0, (%r22)
+; EGPR-NDD-NEXT:    leaq -40(%rbp), %rsp
 ; EGPR-NDD-NEXT:    popq %rbx
 ; EGPR-NDD-NEXT:    popq %r12
 ; EGPR-NDD-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 7070848e3fe3e..e462867360b3b 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1228,7 +1228,7 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
 ; AVX-NEXT:    shrl $3, %ecx
 ; AVX-NEXT:    andl $28, %ecx
 ; AVX-NEXT:    andl %eax, (%rdi,%rcx)
-; AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX-NEXT:    movl (%rdi), %ecx
 ; AVX-NEXT:    movl (%rsi), %eax
 ; AVX-NEXT:    movl %ecx, (%rsi)
diff --git a/llvm/test/CodeGen/X86/cmp-i256.ll b/llvm/test/CodeGen/X86/cmp-i256.ll
new file mode 100644
index 0000000000000..213a355a8f223
--- /dev/null
+++ b/llvm/test/CodeGen/X86/cmp-i256.ll
@@ -0,0 +1,450 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+
+define i32 @icmp_slt_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_slt_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_slt_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpq %r8, %rdi
+; X64-NEXT:    sbbq %r9, %rsi
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    setl %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp slt i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @icmp_sgt_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_sgt_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_sgt_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    cmpq %rdi, %r8
+; X64-NEXT:    sbbq %rsi, %r9
+; X64-NEXT:    sbbq %rdx, %r10
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp sgt i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @icmp_sle_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_sle_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setge %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_sle_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    cmpq %rdi, %r8
+; X64-NEXT:    sbbq %rsi, %r9
+; X64-NEXT:    sbbq %rdx, %r10
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    setge %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp sle i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @icmp_sge_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_sge_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setge %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_sge_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpq %r8, %rdi
+; X64-NEXT:    sbbq %r9, %rsi
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    setge %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp sge i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @icmp_ult_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_ult_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_ult_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpq %r8, %rdi
+; X64-NEXT:    sbbq %r9, %rsi
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    setb %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp ult i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @icmp_ugt_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_ugt_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_ugt_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    cmpq %rdi, %r8
+; X64-NEXT:    sbbq %rsi, %r9
+; X64-NEXT:    sbbq %rdx, %r10
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp ugt i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @icmp_ule_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_ule_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setae %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_ule_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    cmpq %rdi, %r8
+; X64-NEXT:    sbbq %rsi, %r9
+; X64-NEXT:    sbbq %rdx, %r10
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp ule i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @icmp_uge_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_uge_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setae %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: icmp_uge_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpq %r8, %rdi
+; X64-NEXT:    sbbq %r9, %rsi
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %c = icmp uge i256 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+; Select based on i256 comparison
+define i256 @select_slt_i256(i256 %a, i256 %b, i256 %x, i256 %y) nounwind {
+; X86-LABEL: select_slt_i256:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    jl .LBB8_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jge .LBB8_5
+; X86-NEXT:  .LBB8_4:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    jge .LBB8_8
+; X86-NEXT:  .LBB8_7:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    jge .LBB8_11
+; X86-NEXT:  .LBB8_10:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    jge .LBB8_14
+; X86-NEXT:  .LBB8_13:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    jge .LBB8_17
+; X86-NEXT:  .LBB8_16:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    jge .LBB8_20
+; X86-NEXT:  .LBB8_19:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    jmp .LBB8_21
+; X86-NEXT:  .LBB8_1:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jl .LBB8_4
+; X86-NEXT:  .LBB8_5:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    jl .LBB8_7
+; X86-NEXT:  .LBB8_8:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    jl .LBB8_10
+; X86-NEXT:  .LBB8_11:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    jl .LBB8_13
+; X86-NEXT:  .LBB8_14:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    jl .LBB8_16
+; X86-NEXT:  .LBB8_17:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    jl .LBB8_19
+; X86-NEXT:  .LBB8_20:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:  .LBB8_21:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl (%edx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    movl (%edx), %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl (%esi), %esi
+; X86-NEXT:    movl (%edi), %edi
+; X86-NEXT:    movl (%ebx), %ebx
+; X86-NEXT:    movl (%ebp), %ebp
+; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    jl .LBB8_22
+; X86-NEXT:  # %bb.23:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    jmp .LBB8_24
+; X86-NEXT:  .LBB8_22:
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:  .LBB8_24:
+; X86-NEXT:    movl (%edx), %edx
+; X86-NEXT:    movl %edx, 28(%eax)
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl %ebp, 20(%eax)
+; X86-NEXT:    movl %ebx, 16(%eax)
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: select_slt_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    cmpq %r9, %rsi
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    cmovlq %rcx, %rdx
+; X64-NEXT:    movq (%rdx), %rcx
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    cmovlq %rdx, %rsi
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    cmovlq %rsi, %rdi
+; X64-NEXT:    movq (%rdi), %rsi
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    cmovlq %rdi, %r8
+; X64-NEXT:    movq (%r8), %rdi
+; X64-NEXT:    movq %rdi, 24(%rax)
+; X64-NEXT:    movq %rsi, 16(%rax)
+; X64-NEXT:    movq %rdx, 8(%rax)
+; X64-NEXT:    movq %rcx, (%rax)
+; X64-NEXT:    retq
+  %c = icmp slt i256 %a, %b
+  %r = select i1 %c, i256 %x, i256 %y
+  ret i256 %r
+}
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index 3efd536adc4d1..f84293a26e102 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -106,24 +106,26 @@ define i96 @square_high(i96 %x) nounwind {
 ;
 ; X64-LABEL: square_high:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    addq %r8, %rdx
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    adcq $0, %rax
-; X64-NEXT:    addq %rdx, %r8
-; X64-NEXT:    adcq %rsi, %rax
-; X64-NEXT:    imulq %rcx, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    shrdq $32, %rcx, %r8
-; X64-NEXT:    shrq $32, %rcx
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $64, %rsp
+; X64-NEXT:    movq %rdi, %r9
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    subq $8, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r9, %rsi
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %rdx
+; X64-NEXT:    callq ___multi5
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 entry:
   %conv = zext i96 %x to i192
diff --git a/llvm/test/CodeGen/X86/div-i256.ll b/llvm/test/CodeGen/X86/div-i256.ll
new file mode 100644
index 0000000000000..b57c6f5dec5d0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/div-i256.ll
@@ -0,0 +1,5475 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+
+define i256 @udiv256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: udiv256:
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $288, %esp # imm = 0x120
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movl 72(%ebp), %ebx
+; X86-NEXT:    movl 56(%ebp), %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl 64(%ebp), %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 68(%ebp), %edi
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    orl 60(%ebp), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    sete %al
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 40(%ebp), %edx
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    orl 36(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    orl 28(%ebp), %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    sete %ah
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB0_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    jmp .LBB0_3
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:  .LBB0_3: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%ebp), %edx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 56(%ebp), %ebx
+; X86-NEXT:    jne .LBB0_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl 60(%ebp), %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    orl 72(%ebp), %edi
+; X86-NEXT:    je .LBB0_7
+; X86-NEXT:    jmp .LBB0_8
+; X86-NEXT:  .LBB0_4:
+; X86-NEXT:    bsrl %edx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl 72(%ebp), %edi
+; X86-NEXT:    jne .LBB0_8
+; X86-NEXT:  .LBB0_7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB0_8: # %_udiv-special-cases
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    jne .LBB0_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    jmp .LBB0_11
+; X86-NEXT:  .LBB0_9:
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:  .LBB0_11: # %_udiv-special-cases
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB0_12
+; X86-NEXT:  # %bb.13: # %_udiv-special-cases
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    je .LBB0_15
+; X86-NEXT:    jmp .LBB0_16
+; X86-NEXT:  .LBB0_12:
+; X86-NEXT:    bsrl %ecx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    jne .LBB0_16
+; X86-NEXT:  .LBB0_15: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB0_16: # %_udiv-special-cases
+; X86-NEXT:    movl 64(%ebp), %esi
+; X86-NEXT:    orl 72(%ebp), %esi
+; X86-NEXT:    movl 60(%ebp), %edi
+; X86-NEXT:    orl 68(%ebp), %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    jne .LBB0_18
+; X86-NEXT:  # %bb.17: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB0_18: # %_udiv-special-cases
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    jne .LBB0_19
+; X86-NEXT:  # %bb.20: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    je .LBB0_23
+; X86-NEXT:  .LBB0_22:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    je .LBB0_25
+; X86-NEXT:    jmp .LBB0_26
+; X86-NEXT:  .LBB0_19:
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    jne .LBB0_22
+; X86-NEXT:  .LBB0_23: # %_udiv-special-cases
+; X86-NEXT:    bsrl 28(%ebp), %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    jne .LBB0_26
+; X86-NEXT:  .LBB0_25: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB0_26: # %_udiv-special-cases
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl 16(%ebp), %edi
+; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    jne .LBB0_27
+; X86-NEXT:  # %bb.28: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB0_31
+; X86-NEXT:  .LBB0_30:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl 24(%ebp), %ebx
+; X86-NEXT:    je .LBB0_33
+; X86-NEXT:    jmp .LBB0_34
+; X86-NEXT:  .LBB0_27:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB0_30
+; X86-NEXT:  .LBB0_31: # %_udiv-special-cases
+; X86-NEXT:    bsrl 12(%ebp), %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    orl 24(%ebp), %ebx
+; X86-NEXT:    jne .LBB0_34
+; X86-NEXT:  .LBB0_33: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB0_34: # %_udiv-special-cases
+; X86-NEXT:    orb %ah, %al
+; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    orl 40(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    orl 36(%ebp), %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    jne .LBB0_36
+; X86-NEXT:  # %bb.35: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB0_36: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    subl %edx, %esi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB0_37
+; X86-NEXT:  # %bb.38: # %select.false.sink
+; X86-NEXT:    movl $255, %eax
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:  .LBB0_39: # %select.end
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    jne .LBB0_41
+; X86-NEXT:  # %bb.40: # %select.end
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:  .LBB0_41: # %select.end
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB0_42
+; X86-NEXT:  # %bb.48: # %select.end
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $255, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-NEXT:    je .LBB0_49
+; X86-NEXT:  # %bb.46: # %udiv-bb1
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $28, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl 264(%esp,%edx), %esi
+; X86-NEXT:    movl 268(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 260(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 256(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 252(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 248(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 240(%esp,%edx), %eax
+; X86-NEXT:    movl 244(%esp,%edx), %edx
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl $1, %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jb .LBB0_47
+; X86-NEXT:  # %bb.43: # %udiv-preheader
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 172(%esp,%edx,4), %edi
+; X86-NEXT:    movl 168(%esp,%edx,4), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 164(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 160(%esp,%edx,4), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 156(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 152(%esp,%edx,4), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 144(%esp,%edx,4), %esi
+; X86-NEXT:    movl 148(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB0_44: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 72(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 68(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 64(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    andl 60(%ebp), %ebx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl 56(%ebp), %esi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl 52(%ebp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 48(%ebp), %eax
+; X86-NEXT:    andl 44(%ebp), %ecx
+; X86-NEXT:    subl %ecx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB0_44
+; X86-NEXT:  .LBB0_45: # %udiv-loop-exit
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    leal (%esi,%edx,2), %edi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:  .LBB0_49: # %udiv-end
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl %esi, 28(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB0_37:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jmp .LBB0_39
+; X86-NEXT:  .LBB0_47:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB0_45
+; X86-NEXT:  .LBB0_42:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-NEXT:    jmp .LBB0_49
+;
+; X64-LABEL: udiv256:
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $288, %rsp # imm = 0x120
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq 24(%rbp), %r15
+; X64-NEXT:    movq 32(%rbp), %r10
+; X64-NEXT:    movq 16(%rbp), %r11
+; X64-NEXT:    movq %r11, %rdi
+; X64-NEXT:    orq %r10, %rdi
+; X64-NEXT:    movq %r9, %rbx
+; X64-NEXT:    orq %r15, %rbx
+; X64-NEXT:    orq %rdi, %rbx
+; X64-NEXT:    sete %bl
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    orq %r8, %r14
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    orq %rcx, %rdi
+; X64-NEXT:    orq %r14, %rdi
+; X64-NEXT:    sete %dil
+; X64-NEXT:    orb %bl, %dil
+; X64-NEXT:    bsrq %r10, %r14
+; X64-NEXT:    xorq $63, %r14
+; X64-NEXT:    bsrq %r15, %rbx
+; X64-NEXT:    xorq $63, %rbx
+; X64-NEXT:    orq $64, %rbx
+; X64-NEXT:    testq %r10, %r10
+; X64-NEXT:    cmovneq %r14, %rbx
+; X64-NEXT:    bsrq %r11, %r14
+; X64-NEXT:    xorq $63, %r14
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    bsrq %r9, %r9
+; X64-NEXT:    xorq $63, %r9
+; X64-NEXT:    orq $64, %r9
+; X64-NEXT:    testq %r11, %r11
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    cmovneq %r14, %r9
+; X64-NEXT:    orq $128, %r9
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    orq %r10, %rdx
+; X64-NEXT:    cmovneq %rbx, %r9
+; X64-NEXT:    bsrq %r8, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    bsrq %rcx, %rdx
+; X64-NEXT:    xorq $63, %rdx
+; X64-NEXT:    orq $64, %rdx
+; X64-NEXT:    testq %r8, %r8
+; X64-NEXT:    cmovneq %r10, %rdx
+; X64-NEXT:    bsrq %r11, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    bsrq %rsi, %rbx
+; X64-NEXT:    xorq $63, %rbx
+; X64-NEXT:    orq $64, %rbx
+; X64-NEXT:    testq %r11, %r11
+; X64-NEXT:    cmovneq %r10, %rbx
+; X64-NEXT:    orq $128, %rbx
+; X64-NEXT:    movq %rcx, %r10
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r8, %r10
+; X64-NEXT:    cmovneq %rdx, %rbx
+; X64-NEXT:    xorl %r10d, %r10d
+; X64-NEXT:    subq %rbx, %r9
+; X64-NEXT:    movl $0, %ebx
+; X64-NEXT:    sbbq %rbx, %rbx
+; X64-NEXT:    movl $0, %r14d
+; X64-NEXT:    sbbq %r14, %r14
+; X64-NEXT:    sbbq %r10, %r10
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    testb %dil, %dil
+; X64-NEXT:    jne .LBB0_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    movl $255, %ecx
+; X64-NEXT:    cmpq %r9, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %rbx, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r14, %rcx
+; X64-NEXT:    sbbq %r10, %rdx
+; X64-NEXT:    setb %cl
+; X64-NEXT:  .LBB0_3: # %select.end
+; X64-NEXT:    xorl %r12d, %r12d
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    cmovneq %r12, %r11
+; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    cmovneq %r12, %rcx
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    cmovneq %r12, %rdx
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    jne .LBB0_9
+; X64-NEXT:  # %bb.4: # %select.end
+; X64-NEXT:    movq %r9, %rsi
+; X64-NEXT:    xorq $255, %rsi
+; X64-NEXT:    orq %r14, %rsi
+; X64-NEXT:    movq %rbx, %rdi
+; X64-NEXT:    orq %r10, %rdi
+; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    je .LBB0_9
+; X64-NEXT:  # %bb.5: # %udiv-bb1
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %r11
+; X64-NEXT:    movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r9d, %ecx
+; X64-NEXT:    notb %cl
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $3, %al
+; X64-NEXT:    andb $24, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movsbq %al, %rdi
+; X64-NEXT:    movq 240(%rsp,%rdi), %rsi
+; X64-NEXT:    movq 248(%rsp,%rdi), %rax
+; X64-NEXT:    shldq %cl, %rsi, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 224(%rsp,%rdi), %rdx
+; X64-NEXT:    movq 232(%rsp,%rdi), %rax
+; X64-NEXT:    shldq %cl, %rax, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shldq %cl, %rdx, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shlq %cl, %rdx
+; X64-NEXT:    addq $1, %r9
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    jb .LBB0_10
+; X64-NEXT:  # %bb.6: # %udiv-preheader
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r9d, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %edi
+; X64-NEXT:    movq 152(%rsp,%rdi,8), %r8
+; X64-NEXT:    movq %r9, %rcx
+; X64-NEXT:    movq 144(%rsp,%rdi,8), %r9
+; X64-NEXT:    movq %r9, %rsi
+; X64-NEXT:    shrdq %cl, %r8, %rsi
+; X64-NEXT:    movq 128(%rsp,%rdi,8), %rbx
+; X64-NEXT:    movq 136(%rsp,%rdi,8), %rdi
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    shrdq %cl, %r9, %r14
+; X64-NEXT:    shrq %cl, %r8
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    shrdq %cl, %rdi, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    addq $-1, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 16(%rbp), %rax
+; X64-NEXT:    adcq $-1, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 24(%rbp), %rax
+; X64-NEXT:    adcq $-1, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 32(%rbp), %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    xorl %r15d, %r15d
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB0_7: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %rsi, %r8
+; X64-NEXT:    shldq $1, %r14, %rsi
+; X64-NEXT:    shldq $1, %rbx, %r14
+; X64-NEXT:    shldq $1, %r12, %rbx
+; X64-NEXT:    shldq $1, %r10, %r12
+; X64-NEXT:    orq %r15, %r12
+; X64-NEXT:    shldq $1, %r11, %r10
+; X64-NEXT:    orq %rdi, %r10
+; X64-NEXT:    shldq $1, %rdx, %r11
+; X64-NEXT:    orq %r9, %r11
+; X64-NEXT:    addq %rdx, %rdx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    cmpq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %r14, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rsi, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    sbbq %r8, %rdi
+; X64-NEXT:    sarq $63, %rdi
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rdi, %r9
+; X64-NEXT:    andq %rax, %r9
+; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    movq 24(%rbp), %rax
+; X64-NEXT:    andq %rax, %r15
+; X64-NEXT:    movq %rdi, %r13
+; X64-NEXT:    andq 16(%rbp), %r13
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    subq %rdi, %rbx
+; X64-NEXT:    sbbq %r13, %r14
+; X64-NEXT:    movq 32(%rbp), %rax
+; X64-NEXT:    sbbq %r15, %rsi
+; X64-NEXT:    sbbq %r9, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    addq $-1, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r13
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r13, %rdi
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r15, %r9
+; X64-NEXT:    orq %rdi, %r9
+; X64-NEXT:    movl $0, %r9d
+; X64-NEXT:    movl $0, %edi
+; X64-NEXT:    movl $0, %r15d
+; X64-NEXT:    jne .LBB0_7
+; X64-NEXT:  .LBB0_8: # %udiv-loop-exit
+; X64-NEXT:    shldq $1, %r10, %r12
+; X64-NEXT:    shldq $1, %r11, %r10
+; X64-NEXT:    shldq $1, %rdx, %r11
+; X64-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %r10, %rdx
+; X64-NEXT:  .LBB0_9: # %udiv-end
+; X64-NEXT:    movq %rcx, (%rax)
+; X64-NEXT:    movq %r11, 8(%rax)
+; X64-NEXT:    movq %rdx, 16(%rax)
+; X64-NEXT:    movq %r12, 24(%rax)
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB0_1:
+; X64-NEXT:    movb $1, %cl
+; X64-NEXT:    jmp .LBB0_3
+; X64-NEXT:  .LBB0_10:
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    jmp .LBB0_8
+  %r = udiv i256 %a, %b
+  ret i256 %r
+}
+
+define i256 @sdiv256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: sdiv256:
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $320, %esp # imm = 0x140
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 72(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%ebp), %eax
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%ebp), %ebx
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    movl 60(%ebp), %eax
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ebp), %edx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    sete %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    sete %ch
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB1_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    jmp .LBB1_3
+; X86-NEXT:  .LBB1_1:
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:  .LBB1_3: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jne .LBB1_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB1_6
+; X86-NEXT:  .LBB1_4:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB1_6: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    jne .LBB1_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB1_8: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jne .LBB1_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB1_13
+; X86-NEXT:  .LBB1_12:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    je .LBB1_15
+; X86-NEXT:    jmp .LBB1_16
+; X86-NEXT:  .LBB1_9:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB1_12
+; X86-NEXT:  .LBB1_13: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB1_16
+; X86-NEXT:  .LBB1_15: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB1_16: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB1_18
+; X86-NEXT:  # %bb.17: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB1_18: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jne .LBB1_19
+; X86-NEXT:  # %bb.20: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB1_23
+; X86-NEXT:  .LBB1_22:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    je .LBB1_25
+; X86-NEXT:    jmp .LBB1_26
+; X86-NEXT:  .LBB1_19:
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB1_22
+; X86-NEXT:  .LBB1_23: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    jne .LBB1_26
+; X86-NEXT:  .LBB1_25: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB1_26: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jne .LBB1_27
+; X86-NEXT:  # %bb.28: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    je .LBB1_31
+; X86-NEXT:  .LBB1_30:
+; X86-NEXT:    bsrl %ebx, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    jmp .LBB1_32
+; X86-NEXT:  .LBB1_27:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB1_30
+; X86-NEXT:  .LBB1_31: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
+; X86-NEXT:  .LBB1_32: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB1_34
+; X86-NEXT:  # %bb.33: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:  .LBB1_34: # %_udiv-special-cases
+; X86-NEXT:    orb %ch, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB1_36
+; X86-NEXT:  # %bb.35: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB1_36: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    subl %esi, %edx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB1_37
+; X86-NEXT:  # %bb.38: # %select.false.sink
+; X86-NEXT:    movl $255, %ecx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:  .LBB1_39: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    jne .LBB1_41
+; X86-NEXT:  # %bb.40: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:  .LBB1_41: # %select.end
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    jne .LBB1_42
+; X86-NEXT:  # %bb.48: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorl $255, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    je .LBB1_49
+; X86-NEXT:  # %bb.46: # %udiv-bb1
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $28, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl 296(%esp,%edx), %esi
+; X86-NEXT:    movl 300(%esp,%edx), %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 292(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 288(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 284(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 280(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 272(%esp,%edx), %edi
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl 276(%esp,%edx), %edx
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl $1, %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB1_47
+; X86-NEXT:  # %bb.43: # %udiv-preheader
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 204(%esp,%eax,4), %edi
+; X86-NEXT:    movl 200(%esp,%eax,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 196(%esp,%eax,4), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 192(%esp,%eax,4), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shrdl %cl, %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 188(%esp,%eax,4), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 184(%esp,%eax,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 176(%esp,%eax,4), %edx
+; X86-NEXT:    movl 180(%esp,%eax,4), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB1_44: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    subl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB1_44
+; X86-NEXT:  .LBB1_45: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    leal (%ebx,%edi,2), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:  .LBB1_49: # %udiv-end
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    xorl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    subl %edi, %ecx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %ebx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB1_37:
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    jmp .LBB1_39
+; X86-NEXT:  .LBB1_47:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB1_45
+; X86-NEXT:  .LBB1_42:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    jmp .LBB1_49
+;
+; X64-LABEL: sdiv256:
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $288, %rsp # imm = 0x120
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 32(%rbp), %r13
+; X64-NEXT:    movq %r8, %r10
+; X64-NEXT:    sarq $63, %r10
+; X64-NEXT:    xorq %r10, %r8
+; X64-NEXT:    xorq %r10, %rcx
+; X64-NEXT:    xorq %r10, %rdx
+; X64-NEXT:    xorq %r10, %rsi
+; X64-NEXT:    subq %r10, %rsi
+; X64-NEXT:    sbbq %r10, %rdx
+; X64-NEXT:    sbbq %r10, %rcx
+; X64-NEXT:    sbbq %r10, %r8
+; X64-NEXT:    movq %r13, %r11
+; X64-NEXT:    sarq $63, %r11
+; X64-NEXT:    xorq %r11, %r13
+; X64-NEXT:    movq 24(%rbp), %rax
+; X64-NEXT:    xorq %r11, %rax
+; X64-NEXT:    movq 16(%rbp), %r14
+; X64-NEXT:    xorq %r11, %r14
+; X64-NEXT:    xorq %r11, %r9
+; X64-NEXT:    subq %r11, %r9
+; X64-NEXT:    sbbq %r11, %r14
+; X64-NEXT:    sbbq %r11, %rax
+; X64-NEXT:    sbbq %r11, %r13
+; X64-NEXT:    movq %r14, %rbx
+; X64-NEXT:    orq %r13, %rbx
+; X64-NEXT:    movq %r9, %rdi
+; X64-NEXT:    orq %rax, %rdi
+; X64-NEXT:    orq %rbx, %rdi
+; X64-NEXT:    sete %dil
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    orq %r8, %rbx
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    orq %rcx, %r15
+; X64-NEXT:    orq %rbx, %r15
+; X64-NEXT:    sete %bl
+; X64-NEXT:    orb %dil, %bl
+; X64-NEXT:    bsrq %r13, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    bsrq %rax, %r15
+; X64-NEXT:    xorq $63, %r15
+; X64-NEXT:    orq $64, %r15
+; X64-NEXT:    testq %r13, %r13
+; X64-NEXT:    cmovneq %rdi, %r15
+; X64-NEXT:    bsrq %r14, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    bsrq %r9, %r12
+; X64-NEXT:    xorq $63, %r12
+; X64-NEXT:    orq $64, %r12
+; X64-NEXT:    testq %r14, %r14
+; X64-NEXT:    cmovneq %rdi, %r12
+; X64-NEXT:    orq $128, %r12
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r13, %rdi
+; X64-NEXT:    cmovneq %r15, %r12
+; X64-NEXT:    bsrq %r8, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    bsrq %rcx, %r9
+; X64-NEXT:    xorq $63, %r9
+; X64-NEXT:    orq $64, %r9
+; X64-NEXT:    testq %r8, %r8
+; X64-NEXT:    cmovneq %rdi, %r9
+; X64-NEXT:    bsrq %rdx, %rax
+; X64-NEXT:    xorq $63, %rax
+; X64-NEXT:    movq %rsi, (%rsp) # 8-byte Spill
+; X64-NEXT:    bsrq %rsi, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    orq $64, %rdi
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %rax, %rdi
+; X64-NEXT:    orq $128, %rdi
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    cmovneq %r9, %rdi
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    subq %rdi, %r12
+; X64-NEXT:    movl $0, %r15d
+; X64-NEXT:    sbbq %r15, %r15
+; X64-NEXT:    movl $0, %r13d
+; X64-NEXT:    sbbq %r13, %r13
+; X64-NEXT:    sbbq %r9, %r9
+; X64-NEXT:    testb %bl, %bl
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    jne .LBB1_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movl $255, %esi
+; X64-NEXT:    cmpq %r12, %rsi
+; X64-NEXT:    movl $0, %esi
+; X64-NEXT:    sbbq %r15, %rsi
+; X64-NEXT:    movl $0, %esi
+; X64-NEXT:    sbbq %r13, %rsi
+; X64-NEXT:    sbbq %r9, %rcx
+; X64-NEXT:    setb %cl
+; X64-NEXT:  .LBB1_3: # %select.end
+; X64-NEXT:    xorq %r10, %r11
+; X64-NEXT:    xorl %r10d, %r10d
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    cmovneq %r10, %rbx
+; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    cmovneq %r10, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    cmovneq %r10, %rdx
+; X64-NEXT:    cmoveq %r8, %r10
+; X64-NEXT:    jne .LBB1_4
+; X64-NEXT:  # %bb.10: # %select.end
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    xorq $255, %rsi
+; X64-NEXT:    orq %r13, %rsi
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    orq %r9, %rdi
+; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    je .LBB1_11
+; X64-NEXT:  # %bb.8: # %udiv-bb1
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r12d, %ecx
+; X64-NEXT:    notb %cl
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    shrb $3, %dl
+; X64-NEXT:    andb $24, %dl
+; X64-NEXT:    negb %dl
+; X64-NEXT:    movsbq %dl, %rdx
+; X64-NEXT:    movq 240(%rsp,%rdx), %rdi
+; X64-NEXT:    movq 248(%rsp,%rdx), %rsi
+; X64-NEXT:    shldq %cl, %rdi, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 224(%rsp,%rdx), %rsi
+; X64-NEXT:    movq 232(%rsp,%rdx), %r11
+; X64-NEXT:    shldq %cl, %r11, %rdi
+; X64-NEXT:    movq %rdi, %rdx
+; X64-NEXT:    shldq %cl, %rsi, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shlq %cl, %rsi
+; X64-NEXT:    addq $1, %r12
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    jb .LBB1_9
+; X64-NEXT:  # %bb.5: # %udiv-preheader
+; X64-NEXT:    movq %r15, %r10
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r12d, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %edi
+; X64-NEXT:    movq 152(%rsp,%rdi,8), %r8
+; X64-NEXT:    movq 144(%rsp,%rdi,8), %r11
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movl %r12d, %ecx
+; X64-NEXT:    shrdq %cl, %r8, %rax
+; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    movq 128(%rsp,%rdi,8), %r12
+; X64-NEXT:    movq 136(%rsp,%rdi,8), %rdi
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    shrdq %cl, %r11, %r14
+; X64-NEXT:    shrq %cl, %r8
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    shrdq %cl, %rdi, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    addq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r15d, %r15d
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    xorl %r13d, %r13d
+; X64-NEXT:    movq %r9, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB1_6: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %rax, %r8
+; X64-NEXT:    shldq $1, %r14, %rax
+; X64-NEXT:    shldq $1, %r12, %r14
+; X64-NEXT:    shldq $1, %r10, %r12
+; X64-NEXT:    shldq $1, %rdx, %r10
+; X64-NEXT:    orq %r13, %r10
+; X64-NEXT:    shldq $1, %rbx, %rdx
+; X64-NEXT:    orq %rdi, %rdx
+; X64-NEXT:    shldq $1, %rsi, %rbx
+; X64-NEXT:    orq %r15, %rbx
+; X64-NEXT:    addq %rsi, %rsi
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    cmpq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %r14, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    sbbq %r8, %rdi
+; X64-NEXT:    sarq $63, %rdi
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT:    movq %rdi, %r13
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    movq %rdi, %r11
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    subq %rdi, %r12
+; X64-NEXT:    sbbq %r11, %r14
+; X64-NEXT:    sbbq %r13, %rax
+; X64-NEXT:    sbbq %r15, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    addq $-1, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rdi
+; X64-NEXT:    adcq $-1, %r9
+; X64-NEXT:    movq (%rsp), %r15 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r15
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, (%rsp) # 8-byte Spill
+; X64-NEXT:    orq %r15, %rdi
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r9, %r11
+; X64-NEXT:    orq %rdi, %r11
+; X64-NEXT:    movl $0, %r15d
+; X64-NEXT:    movl $0, %edi
+; X64-NEXT:    movl $0, %r13d
+; X64-NEXT:    jne .LBB1_6
+; X64-NEXT:  .LBB1_7: # %udiv-loop-exit
+; X64-NEXT:    shldq $1, %rdx, %r10
+; X64-NEXT:    shldq $1, %rbx, %rdx
+; X64-NEXT:    shldq $1, %rsi, %rbx
+; X64-NEXT:    leaq (%rcx,%rsi,2), %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:  .LBB1_11: # %udiv-end
+; X64-NEXT:    xorq %r11, %r10
+; X64-NEXT:    xorq %r11, %rdx
+; X64-NEXT:    xorq %r11, %rbx
+; X64-NEXT:    xorq %r11, %rcx
+; X64-NEXT:    subq %r11, %rcx
+; X64-NEXT:    sbbq %r11, %rbx
+; X64-NEXT:    sbbq %r11, %rdx
+; X64-NEXT:    sbbq %r11, %r10
+; X64-NEXT:    movq %rcx, (%rax)
+; X64-NEXT:    movq %rbx, 8(%rax)
+; X64-NEXT:    movq %rdx, 16(%rax)
+; X64-NEXT:    movq %r10, 24(%rax)
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB1_1:
+; X64-NEXT:    movb $1, %cl
+; X64-NEXT:    jmp .LBB1_3
+; X64-NEXT:  .LBB1_9:
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    jmp .LBB1_7
+; X64-NEXT:  .LBB1_4:
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    jmp .LBB1_11
+  %r = sdiv i256 %a, %b
+  ret i256 %r
+}
+
+define i256 @urem256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: urem256:
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $288, %esp # imm = 0x120
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movl 72(%ebp), %edi
+; X86-NEXT:    movl 56(%ebp), %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl 64(%ebp), %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 68(%ebp), %ebx
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    orl 60(%ebp), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    sete %cl
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 40(%ebp), %edx
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    orl 32(%ebp), %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    orl 36(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    orl 28(%ebp), %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    sete %ch
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB2_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB2_3
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB2_3: # %_udiv-special-cases
+; X86-NEXT:    movl 64(%ebp), %esi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    jne .LBB2_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl 60(%ebp), %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    orl 72(%ebp), %ebx
+; X86-NEXT:    je .LBB2_7
+; X86-NEXT:    jmp .LBB2_8
+; X86-NEXT:  .LBB2_4:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl 72(%ebp), %ebx
+; X86-NEXT:    jne .LBB2_8
+; X86-NEXT:  .LBB2_7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB2_8: # %_udiv-special-cases
+; X86-NEXT:    movl 56(%ebp), %esi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    jne .LBB2_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    bsrl 52(%ebp), %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    je .LBB2_13
+; X86-NEXT:  .LBB2_12:
+; X86-NEXT:    bsrl %ebx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    jmp .LBB2_14
+; X86-NEXT:  .LBB2_9:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB2_12
+; X86-NEXT:  .LBB2_13: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:  .LBB2_14: # %_udiv-special-cases
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    orl 56(%ebp), %ebx
+; X86-NEXT:    jne .LBB2_16
+; X86-NEXT:  # %bb.15: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB2_16: # %_udiv-special-cases
+; X86-NEXT:    movl 64(%ebp), %edi
+; X86-NEXT:    orl 72(%ebp), %edi
+; X86-NEXT:    movl 60(%ebp), %ebx
+; X86-NEXT:    orl 68(%ebp), %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB2_18
+; X86-NEXT:  # %bb.17: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB2_18: # %_udiv-special-cases
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    jne .LBB2_19
+; X86-NEXT:  # %bb.20: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB2_23
+; X86-NEXT:  .LBB2_22:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    je .LBB2_25
+; X86-NEXT:    jmp .LBB2_26
+; X86-NEXT:  .LBB2_19:
+; X86-NEXT:    bsrl %edx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB2_22
+; X86-NEXT:  .LBB2_23: # %_udiv-special-cases
+; X86-NEXT:    bsrl 28(%ebp), %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB2_26
+; X86-NEXT:  .LBB2_25: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB2_26: # %_udiv-special-cases
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB2_27
+; X86-NEXT:  # %bb.28: # %_udiv-special-cases
+; X86-NEXT:    bsrl 20(%ebp), %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    jmp .LBB2_29
+; X86-NEXT:  .LBB2_27:
+; X86-NEXT:    bsrl %edx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:  .LBB2_29: # %_udiv-special-cases
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB2_30
+; X86-NEXT:  # %bb.31: # %_udiv-special-cases
+; X86-NEXT:    bsrl 12(%ebp), %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
+; X86-NEXT:    jmp .LBB2_32
+; X86-NEXT:  .LBB2_30:
+; X86-NEXT:    bsrl %edx, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:  .LBB2_32: # %_udiv-special-cases
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    orl 24(%ebp), %edx
+; X86-NEXT:    jne .LBB2_34
+; X86-NEXT:  # %bb.33: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:  .LBB2_34: # %_udiv-special-cases
+; X86-NEXT:    orb %ch, %cl
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    orl 40(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %ebx
+; X86-NEXT:    orl 36(%ebp), %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB2_36
+; X86-NEXT:  # %bb.35: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB2_36: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB2_37
+; X86-NEXT:  # %bb.38: # %select.false.sink
+; X86-NEXT:    movl $255, %ecx
+; X86-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:  .LBB2_39: # %select.end
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jne .LBB2_41
+; X86-NEXT:  # %bb.40: # %select.end
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB2_41: # %select.end
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB2_42
+; X86-NEXT:  # %bb.48: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    xorl $255, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    je .LBB2_49
+; X86-NEXT:  # %bb.46: # %udiv-bb1
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrb $3, %dl
+; X86-NEXT:    andb $28, %dl
+; X86-NEXT:    negb %dl
+; X86-NEXT:    movsbl %dl, %edx
+; X86-NEXT:    movl 264(%esp,%edx), %esi
+; X86-NEXT:    movl 268(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 260(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 256(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 252(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 248(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 240(%esp,%edx), %edi
+; X86-NEXT:    movl 244(%esp,%edx), %edx
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl $1, %ecx
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB2_47
+; X86-NEXT:  # %bb.43: # %udiv-preheader
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %esi
+; X86-NEXT:    movl 172(%esp,%esi,4), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 168(%esp,%esi,4), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 164(%esp,%esi,4), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 160(%esp,%esi,4), %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 156(%esp,%esi,4), %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 152(%esp,%esi,4), %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 144(%esp,%esi,4), %edi
+; X86-NEXT:    movl 148(%esp,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shrdl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB2_44: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 72(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 68(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 64(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    andl 60(%ebp), %edi
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl 56(%ebp), %esi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl 52(%ebp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 48(%ebp), %eax
+; X86-NEXT:    andl 44(%ebp), %ecx
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    jne .LBB2_44
+; X86-NEXT:  .LBB2_45: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    leal (%edx,%eax,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:  .LBB2_49: # %udiv-end
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ebp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl 56(%ebp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ebp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl 56(%ebp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%ebp), %eax
+; X86-NEXT:    imull %eax, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl 64(%ebp), %edi
+; X86-NEXT:    imull %edi, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl 68(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl 72(%ebp), %ecx
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl 60(%ebp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull 64(%ebp)
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull 64(%ebp)
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl 56(%ebp), %ecx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull 48(%ebp)
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ebp), %ebx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, (%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 4(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 8(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 12(%edx)
+; X86-NEXT:    movl %esi, 16(%edx)
+; X86-NEXT:    movl %edi, 20(%edx)
+; X86-NEXT:    movl %ecx, 24(%edx)
+; X86-NEXT:    movl %eax, 28(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB2_37:
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jmp .LBB2_39
+; X86-NEXT:  .LBB2_47:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB2_45
+; X86-NEXT:  .LBB2_42:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jmp .LBB2_49
+;
+; X64-LABEL: urem256:
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $288, %rsp # imm = 0x120
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 24(%rbp), %r14
+; X64-NEXT:    movq 32(%rbp), %r15
+; X64-NEXT:    movq 16(%rbp), %rbx
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    orq %r15, %rax
+; X64-NEXT:    movq %r9, %rdi
+; X64-NEXT:    orq %r14, %rdi
+; X64-NEXT:    orq %rax, %rdi
+; X64-NEXT:    sete %dil
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    orq %rcx, %r10
+; X64-NEXT:    orq %rax, %r10
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %dil, %al
+; X64-NEXT:    bsrq %r15, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    bsrq %r14, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    orq $64, %r11
+; X64-NEXT:    testq %r15, %r15
+; X64-NEXT:    cmovneq %rdi, %r11
+; X64-NEXT:    bsrq %rbx, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    bsrq %r9, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    orq $64, %r10
+; X64-NEXT:    testq %rbx, %rbx
+; X64-NEXT:    cmovneq %rdi, %r10
+; X64-NEXT:    orq $128, %r10
+; X64-NEXT:    movq %r14, %rdi
+; X64-NEXT:    orq %r15, %rdi
+; X64-NEXT:    cmovneq %r11, %r10
+; X64-NEXT:    bsrq %r8, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    bsrq %rcx, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    orq $64, %r11
+; X64-NEXT:    testq %r8, %r8
+; X64-NEXT:    cmovneq %rdi, %r11
+; X64-NEXT:    bsrq %rdx, %r9
+; X64-NEXT:    xorq $63, %r9
+; X64-NEXT:    bsrq %rsi, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    orq $64, %rdi
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %r9, %rdi
+; X64-NEXT:    orq $128, %rdi
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r8, %rcx
+; X64-NEXT:    cmovneq %r11, %rdi
+; X64-NEXT:    xorl %ebx, %ebx
+; X64-NEXT:    subq %rdi, %r10
+; X64-NEXT:    movl $0, %r12d
+; X64-NEXT:    sbbq %r12, %r12
+; X64-NEXT:    movl $0, %r14d
+; X64-NEXT:    sbbq %r14, %r14
+; X64-NEXT:    sbbq %rbx, %rbx
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB2_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl $255, %ecx
+; X64-NEXT:    cmpq %r10, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r12, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r14, %rcx
+; X64-NEXT:    sbbq %rbx, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB2_3: # %select.end
+; X64-NEXT:    xorl %r11d, %r11d
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    cmovneq %r11, %rdi
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    cmovneq %r11, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    cmovneq %r11, %rax
+; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    cmoveq %r8, %r11
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    jne .LBB2_4
+; X64-NEXT:  # %bb.10: # %select.end
+; X64-NEXT:    movq %r10, %rdx
+; X64-NEXT:    xorq $255, %rdx
+; X64-NEXT:    orq %r14, %rdx
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    orq %rbx, %rsi
+; X64-NEXT:    orq %rdx, %rsi
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT:    je .LBB2_11
+; X64-NEXT:  # %bb.8: # %udiv-bb1
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r10d, %ecx
+; X64-NEXT:    notb %cl
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $3, %al
+; X64-NEXT:    andb $24, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movsbq %al, %rdx
+; X64-NEXT:    movq 240(%rsp,%rdx), %r11
+; X64-NEXT:    movq 248(%rsp,%rdx), %rax
+; X64-NEXT:    shldq %cl, %r11, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %rsi
+; X64-NEXT:    movq 224(%rsp,%rdx), %r8
+; X64-NEXT:    movq 232(%rsp,%rdx), %r9
+; X64-NEXT:    shldq %cl, %r9, %r11
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    shldq %cl, %r8, %r9
+; X64-NEXT:    shlq %cl, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq $1, %r10
+; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq %rbx, (%rsp) # 8-byte Spill
+; X64-NEXT:    jb .LBB2_9
+; X64-NEXT:  # %bb.5: # %udiv-preheader
+; X64-NEXT:    movq %r14, %r12
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r10d, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %edx
+; X64-NEXT:    movq 152(%rsp,%rdx,8), %rsi
+; X64-NEXT:    movq 144(%rsp,%rdx,8), %rdi
+; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    movl %r10d, %ecx
+; X64-NEXT:    shrdq %cl, %rsi, %r8
+; X64-NEXT:    movq %r10, %rcx
+; X64-NEXT:    movq 128(%rsp,%rdx,8), %r10
+; X64-NEXT:    movq 136(%rsp,%rdx,8), %rdx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    shrdq %cl, %rdi, %rbx
+; X64-NEXT:    shrq %cl, %rsi
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    shrdq %cl, %rdx, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    addq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 16(%rbp), %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 24(%rbp), %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    xorl %r14d, %r14d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB2_6: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %r8, %rsi
+; X64-NEXT:    shldq $1, %rbx, %r8
+; X64-NEXT:    shldq $1, %r10, %rbx
+; X64-NEXT:    shldq $1, %r11, %r10
+; X64-NEXT:    shldq $1, %rax, %r11
+; X64-NEXT:    orq %r14, %r11
+; X64-NEXT:    shldq $1, %r9, %rax
+; X64-NEXT:    orq %rdx, %rax
+; X64-NEXT:    shldq $1, %r12, %r9
+; X64-NEXT:    orq %rdi, %r9
+; X64-NEXT:    addq %r12, %r12
+; X64-NEXT:    orq %rcx, %r12
+; X64-NEXT:    cmpq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rbx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %r8, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    sbbq %rsi, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    andq %r15, %rdi
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq 24(%rbp), %r15
+; X64-NEXT:    andq %r15, %r14
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    andq 16(%rbp), %r13
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    subq %rdx, %r10
+; X64-NEXT:    sbbq %r13, %rbx
+; X64-NEXT:    movq 32(%rbp), %r15
+; X64-NEXT:    sbbq %r14, %r8
+; X64-NEXT:    sbbq %rdi, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    addq $-1, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r14
+; X64-NEXT:    movq (%rsp), %r13 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r13
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r13, (%rsp) # 8-byte Spill
+; X64-NEXT:    orq %r13, %rdx
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r14, %rdi
+; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    movl $0, %edi
+; X64-NEXT:    movl $0, %edx
+; X64-NEXT:    movl $0, %r14d
+; X64-NEXT:    jne .LBB2_6
+; X64-NEXT:  .LBB2_7: # %udiv-loop-exit
+; X64-NEXT:    shldq $1, %rax, %r11
+; X64-NEXT:    shldq $1, %r9, %rax
+; X64-NEXT:    shldq $1, %r12, %r9
+; X64-NEXT:    leaq (%rcx,%r12,2), %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r9, %rdi
+; X64-NEXT:  .LBB2_11: # %udiv-end
+; X64-NEXT:    movq 16(%rbp), %r10
+; X64-NEXT:    movq %r10, %rsi
+; X64-NEXT:    imulq %rax, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdi, %r9
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %rsi, %rdx
+; X64-NEXT:    imulq %rbx, %r11
+; X64-NEXT:    addq %rdx, %r11
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq 24(%rbp), %rsi
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    imulq %rcx, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    imulq %r9, %rsi
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    addq %r15, %r14
+; X64-NEXT:    adcq %r11, %rsi
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %r12, %rdi
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %rdi, %rcx
+; X64-NEXT:    adcq %r13, %rbx
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movzbl %sil, %esi
+; X64-NEXT:    adcq %rsi, %rdx
+; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    adcq %r15, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    subq %r11, %rsi
+; X64-NEXT:    sbbq %rcx, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rdx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rsi, (%rax)
+; X64-NEXT:    movq %r8, 8(%rax)
+; X64-NEXT:    movq %rdi, 16(%rax)
+; X64-NEXT:    movq %rcx, 24(%rax)
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB2_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB2_3
+; X64-NEXT:  .LBB2_9:
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    jmp .LBB2_7
+; X64-NEXT:  .LBB2_4:
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT:    jmp .LBB2_11
+  %r = urem i256 %a, %b
+  ret i256 %r
+}
+
+define i256 @srem256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: srem256:
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $368, %esp # imm = 0x170
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 72(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl 56(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB3_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB3_3
+; X86-NEXT:  .LBB3_1:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:  .LBB3_3: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jne .LBB3_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB3_6
+; X86-NEXT:  .LBB3_4:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB3_6: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB3_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB3_8: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jne .LBB3_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB3_13
+; X86-NEXT:  .LBB3_12:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    jmp .LBB3_14
+; X86-NEXT:  .LBB3_9:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB3_12
+; X86-NEXT:  .LBB3_13: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:  .LBB3_14: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB3_16
+; X86-NEXT:  # %bb.15: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB3_16: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB3_18
+; X86-NEXT:  # %bb.17: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB3_18: # %_udiv-special-cases
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jne .LBB3_19
+; X86-NEXT:  # %bb.20: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB3_21
+; X86-NEXT:  .LBB3_19:
+; X86-NEXT:    bsrl %edx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB3_21: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB3_22
+; X86-NEXT:  # %bb.23: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    je .LBB3_25
+; X86-NEXT:    jmp .LBB3_26
+; X86-NEXT:  .LBB3_22:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB3_26
+; X86-NEXT:  .LBB3_25: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB3_26: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB3_27
+; X86-NEXT:  # %bb.28: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    bsrl %edx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB3_31
+; X86-NEXT:  .LBB3_30:
+; X86-NEXT:    bsrl %ecx, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    jmp .LBB3_32
+; X86-NEXT:  .LBB3_27:
+; X86-NEXT:    bsrl %edx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB3_30
+; X86-NEXT:  .LBB3_31: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
+; X86-NEXT:  .LBB3_32: # %_udiv-special-cases
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB3_34
+; X86-NEXT:  # %bb.33: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:  .LBB3_34: # %_udiv-special-cases
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB3_36
+; X86-NEXT:  # %bb.35: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:  .LBB3_36: # %_udiv-special-cases
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB3_37
+; X86-NEXT:  # %bb.38: # %select.false.sink
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $255, %ecx
+; X86-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:  .LBB3_39: # %select.end
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jne .LBB3_41
+; X86-NEXT:  # %bb.40: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB3_41: # %select.end
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB3_42
+; X86-NEXT:  # %bb.48: # %select.end
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    xorl $255, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    je .LBB3_49
+; X86-NEXT:  # %bb.46: # %udiv-bb1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrb $3, %dl
+; X86-NEXT:    andb $28, %dl
+; X86-NEXT:    negb %dl
+; X86-NEXT:    movsbl %dl, %edx
+; X86-NEXT:    movl 344(%esp,%edx), %esi
+; X86-NEXT:    movl 348(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 340(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 336(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 332(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 328(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 320(%esp,%edx), %edi
+; X86-NEXT:    movl 324(%esp,%edx), %edx
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl $1, %ecx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB3_47
+; X86-NEXT:  # %bb.43: # %udiv-preheader
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 252(%esp,%eax,4), %esi
+; X86-NEXT:    movl 248(%esp,%eax,4), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 244(%esp,%eax,4), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 240(%esp,%eax,4), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    shrdl %cl, %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 236(%esp,%eax,4), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 232(%esp,%eax,4), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 224(%esp,%eax,4), %edi
+; X86-NEXT:    movl 228(%esp,%eax,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shrdl %cl, %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB3_44: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB3_44
+; X86-NEXT:  .LBB3_45: # %udiv-loop-exit
+; X86-NEXT:    shldl $1, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    leal (%edi,%esi,2), %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:  .LBB3_49: # %udiv-end
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    imull %edi, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    imull %ecx, %edx
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, (%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 4(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 8(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 12(%edx)
+; X86-NEXT:    movl %edi, 16(%edx)
+; X86-NEXT:    movl %esi, 20(%edx)
+; X86-NEXT:    movl %ecx, 24(%edx)
+; X86-NEXT:    movl %eax, 28(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB3_37:
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jmp .LBB3_39
+; X86-NEXT:  .LBB3_47:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB3_45
+; X86-NEXT:  .LBB3_42:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jmp .LBB3_49
+;
+; X64-LABEL: srem256:
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $320, %rsp # imm = 0x140
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movq 32(%rbp), %r15
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    xorq %rax, %r8
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    xorq %rax, %rdx
+; X64-NEXT:    xorq %rax, %rsi
+; X64-NEXT:    subq %rax, %rsi
+; X64-NEXT:    sbbq %rax, %rdx
+; X64-NEXT:    sbbq %rax, %r14
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    sbbq %rax, %r8
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    xorq %rax, %r15
+; X64-NEXT:    movq 24(%rbp), %r13
+; X64-NEXT:    xorq %rax, %r13
+; X64-NEXT:    movq 16(%rbp), %r10
+; X64-NEXT:    xorq %rax, %r10
+; X64-NEXT:    xorq %rax, %r9
+; X64-NEXT:    subq %rax, %r9
+; X64-NEXT:    sbbq %rax, %r10
+; X64-NEXT:    sbbq %rax, %r13
+; X64-NEXT:    sbbq %rax, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    orq %r15, %rax
+; X64-NEXT:    movq %r9, %rcx
+; X64-NEXT:    orq %r13, %rcx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    sete %cl
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    movq %rsi, %r11
+; X64-NEXT:    orq %r14, %r11
+; X64-NEXT:    orq %rax, %r11
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    bsrq %r15, %rcx
+; X64-NEXT:    xorq $63, %rcx
+; X64-NEXT:    bsrq %r13, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    orq $64, %r11
+; X64-NEXT:    testq %r15, %r15
+; X64-NEXT:    cmovneq %rcx, %r11
+; X64-NEXT:    bsrq %r10, %rcx
+; X64-NEXT:    xorq $63, %rcx
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    bsrq %r9, %rbx
+; X64-NEXT:    xorq $63, %rbx
+; X64-NEXT:    orq $64, %rbx
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    testq %r10, %r10
+; X64-NEXT:    cmovneq %rcx, %rbx
+; X64-NEXT:    orq $128, %rbx
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    orq %r15, %rcx
+; X64-NEXT:    cmovneq %r11, %rbx
+; X64-NEXT:    bsrq %r8, %rcx
+; X64-NEXT:    xorq $63, %rcx
+; X64-NEXT:    bsrq %r14, %r9
+; X64-NEXT:    xorq $63, %r9
+; X64-NEXT:    orq $64, %r9
+; X64-NEXT:    testq %r8, %r8
+; X64-NEXT:    cmovneq %rcx, %r9
+; X64-NEXT:    bsrq %rdx, %rcx
+; X64-NEXT:    xorq $63, %rcx
+; X64-NEXT:    bsrq %rsi, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    orq $64, %r11
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %rcx, %r11
+; X64-NEXT:    orq $128, %r11
+; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    orq %r8, %rcx
+; X64-NEXT:    cmovneq %r9, %r11
+; X64-NEXT:    xorl %r10d, %r10d
+; X64-NEXT:    subq %r11, %rbx
+; X64-NEXT:    movl $0, %r11d
+; X64-NEXT:    sbbq %r11, %r11
+; X64-NEXT:    movl $0, %r15d
+; X64-NEXT:    sbbq %r15, %r15
+; X64-NEXT:    sbbq %r10, %r10
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB3_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl $255, %ecx
+; X64-NEXT:    cmpq %rbx, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r11, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r15, %rcx
+; X64-NEXT:    sbbq %r10, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB3_3: # %select.end
+; X64-NEXT:    xorl %r12d, %r12d
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    cmovneq %r12, %r9
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    cmovneq %r12, %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    cmovneq %r12, %rax
+; X64-NEXT:    cmoveq %r8, %r12
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    jne .LBB3_9
+; X64-NEXT:  # %bb.4: # %select.end
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    xorq $255, %rdx
+; X64-NEXT:    orq %r15, %rdx
+; X64-NEXT:    movq %r11, %rsi
+; X64-NEXT:    orq %r10, %rsi
+; X64-NEXT:    orq %rdx, %rsi
+; X64-NEXT:    je .LBB3_9
+; X64-NEXT:  # %bb.5: # %udiv-bb1
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %ebx, %ecx
+; X64-NEXT:    notb %cl
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $3, %al
+; X64-NEXT:    andb $24, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movsbq %al, %rdx
+; X64-NEXT:    movq 272(%rsp,%rdx), %rax
+; X64-NEXT:    movq 280(%rsp,%rdx), %r12
+; X64-NEXT:    shldq %cl, %rax, %r12
+; X64-NEXT:    movq 256(%rsp,%rdx), %rsi
+; X64-NEXT:    movq 264(%rsp,%rdx), %r9
+; X64-NEXT:    shldq %cl, %r9, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shldq %cl, %rsi, %r9
+; X64-NEXT:    shlq %cl, %rsi
+; X64-NEXT:    addq $1, %rbx
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    jb .LBB3_10
+; X64-NEXT:  # %bb.6: # %udiv-preheader
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %ebx, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %edx
+; X64-NEXT:    movq 184(%rsp,%rdx,8), %r11
+; X64-NEXT:    movq 176(%rsp,%rdx,8), %rdi
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movl %ebx, %ecx
+; X64-NEXT:    shrdq %cl, %r11, %r10
+; X64-NEXT:    movq 160(%rsp,%rdx,8), %r8
+; X64-NEXT:    movq 168(%rsp,%rdx,8), %rdx
+; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    shrdq %cl, %rdi, %rbx
+; X64-NEXT:    shrq %cl, %r11
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    shrdq %cl, %rdx, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    addq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    xorl %r14d, %r14d
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB3_7: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %r10, %r11
+; X64-NEXT:    shldq $1, %rbx, %r10
+; X64-NEXT:    shldq $1, %r8, %rbx
+; X64-NEXT:    shldq $1, %r12, %r8
+; X64-NEXT:    shldq $1, %rax, %r12
+; X64-NEXT:    orq %r14, %r12
+; X64-NEXT:    shldq $1, %r9, %rax
+; X64-NEXT:    orq %rdx, %rax
+; X64-NEXT:    shldq $1, %rsi, %r9
+; X64-NEXT:    orq %rdi, %r9
+; X64-NEXT:    addq %rsi, %rsi
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    cmpq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rbx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %r10, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    sbbq %r11, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    andq %r13, %r14
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r13, %r15
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    subq %rdx, %r8
+; X64-NEXT:    sbbq %r13, %rbx
+; X64-NEXT:    movq %r15, %r13
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    sbbq %r14, %r10
+; X64-NEXT:    sbbq %rdi, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    addq $-1, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r14
+; X64-NEXT:    adcq $-1, %r15
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r15, %rdx
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r14, %rdi
+; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    movl $0, %edi
+; X64-NEXT:    movl $0, %edx
+; X64-NEXT:    movl $0, %r14d
+; X64-NEXT:    jne .LBB3_7
+; X64-NEXT:  .LBB3_8: # %udiv-loop-exit
+; X64-NEXT:    shldq $1, %rax, %r12
+; X64-NEXT:    shldq $1, %r9, %rax
+; X64-NEXT:    shldq $1, %rsi, %r9
+; X64-NEXT:    leaq (%rcx,%rsi,2), %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:  .LBB3_9: # %udiv-end
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    imulq %r10, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rsi, %rdx
+; X64-NEXT:    imulq %rbx, %r12
+; X64-NEXT:    addq %rdx, %r12
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    addq %rdx, %rax
+; X64-NEXT:    imulq %r9, %r13
+; X64-NEXT:    addq %rax, %r13
+; X64-NEXT:    addq %r11, %rbx
+; X64-NEXT:    adcq %r12, %r13
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %r12, %rdi
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %rdi, %rcx
+; X64-NEXT:    adcq %r13, %r11
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    movzbl %sil, %esi
+; X64-NEXT:    adcq %rsi, %rdx
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    sbbq %rcx, %rsi
+; X64-NEXT:    sbbq %rax, %r14
+; X64-NEXT:    sbbq %rdx, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    xorq %rax, %r8
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    xorq %rax, %rsi
+; X64-NEXT:    xorq %rax, %rdi
+; X64-NEXT:    subq %rax, %rdi
+; X64-NEXT:    sbbq %rax, %rsi
+; X64-NEXT:    sbbq %rax, %r14
+; X64-NEXT:    sbbq %rax, %r8
+; X64-NEXT:    movq %rdi, (%r15)
+; X64-NEXT:    movq %rsi, 8(%r15)
+; X64-NEXT:    movq %r14, 16(%r15)
+; X64-NEXT:    movq %r8, 24(%r15)
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB3_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB3_3
+; X64-NEXT:  .LBB3_10:
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    jmp .LBB3_8
+  %r = srem i256 %a, %b
+  ret i256 %r
+}
+
+; Division by power of 2 should optimize to shift
+define i256 @udiv256_pow2(i256 %a) nounwind {
+; X86-LABEL: udiv256_pow2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    shldl $28, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $28, %ebx, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    shldl $28, %ecx, %ebx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shldl $28, %edi, %esi
+; X86-NEXT:    shldl $28, %eax, %edi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shldl $28, %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrdl $4, %eax, %ecx
+; X86-NEXT:    shrl $4, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, 28(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, 24(%eax)
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, 20(%eax)
+; X86-NEXT:    movl %ebx, 16(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: udiv256_pow2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrdq $4, %rdx, %rsi
+; X64-NEXT:    shrdq $4, %rcx, %rdx
+; X64-NEXT:    shrdq $4, %r8, %rcx
+; X64-NEXT:    shrq $4, %r8
+; X64-NEXT:    movq %r8, 24(%rdi)
+; X64-NEXT:    movq %rcx, 16(%rdi)
+; X64-NEXT:    movq %rdx, 8(%rdi)
+; X64-NEXT:    movq %rsi, (%rdi)
+; X64-NEXT:    retq
+  %r = udiv i256 %a, 16
+  ret i256 %r
+}
+
+; Division by constant
+define i256 @sdiv256_const(i256 %a) nounwind {
+; X86-LABEL: sdiv256_const:
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $288, %esp # imm = 0x120
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 16(%ebp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB5_3
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB5_3: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    jne .LBB5_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB5_6
+; X86-NEXT:  .LBB5_4:
+; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB5_6: # %_udiv-special-cases
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    jne .LBB5_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_8: # %_udiv-special-cases
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB5_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    jmp .LBB5_11
+; X86-NEXT:  .LBB5_9:
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:  .LBB5_11: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB5_12
+; X86-NEXT:  # %bb.13: # %_udiv-special-cases
+; X86-NEXT:    bsrl %esi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    jmp .LBB5_14
+; X86-NEXT:  .LBB5_12:
+; X86-NEXT:    bsrl %ecx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:  .LBB5_14: # %_udiv-special-cases
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    jne .LBB5_16
+; X86-NEXT:  # %bb.15: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_16: # %_udiv-special-cases
+; X86-NEXT:    orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    jne .LBB5_18
+; X86-NEXT:  # %bb.17: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl $128, %eax
+; X86-NEXT:  .LBB5_18: # %_udiv-special-cases
+; X86-NEXT:    movl $253, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    je .LBB5_19
+; X86-NEXT:  # %bb.20: # %select.false.sink
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $255, %edx
+; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb %cl
+; X86-NEXT:  .LBB5_21: # %select.end
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jne .LBB5_23
+; X86-NEXT:  # %bb.22: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:  .LBB5_23: # %select.end
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jne .LBB5_24
+; X86-NEXT:  # %bb.30: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    xorl $255, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    je .LBB5_31
+; X86-NEXT:  # %bb.28: # %udiv-bb1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrb $3, %dl
+; X86-NEXT:    andb $28, %dl
+; X86-NEXT:    negb %dl
+; X86-NEXT:    movsbl %dl, %edx
+; X86-NEXT:    movl 264(%esp,%edx), %esi
+; X86-NEXT:    movl 268(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 260(%esp,%edx), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 256(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 252(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 248(%esp,%edx), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl 240(%esp,%edx), %eax
+; X86-NEXT:    movl 244(%esp,%edx), %edx
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl $1, %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB5_29
+; X86-NEXT:  # %bb.25: # %udiv-preheader
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %ebx
+; X86-NEXT:    movl 172(%esp,%ebx,4), %esi
+; X86-NEXT:    movl 168(%esp,%ebx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 164(%esp,%ebx,4), %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 160(%esp,%ebx,4), %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 156(%esp,%ebx,4), %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 152(%esp,%ebx,4), %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 144(%esp,%ebx,4), %edi
+; X86-NEXT:    movl 148(%esp,%ebx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $7, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB5_26: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $7, %edi
+; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    jne .LBB5_26
+; X86-NEXT:  .LBB5_27: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    leal (%esi,%eax,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_31: # %udiv-end
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    movl %ecx, 4(%edx)
+; X86-NEXT:    movl %esi, 8(%edx)
+; X86-NEXT:    movl %edi, 12(%edx)
+; X86-NEXT:    movl %ebx, 16(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 20(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 24(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 28(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB5_19:
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    jmp .LBB5_21
+; X86-NEXT:  .LBB5_29:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB5_27
+; X86-NEXT:  .LBB5_24:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jmp .LBB5_31
+;
+; X64-LABEL: sdiv256_const:
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $256, %rsp # imm = 0x100
+; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    xorq %rax, %r8
+; X64-NEXT:    xorq %rax, %r9
+; X64-NEXT:    xorq %rax, %rdx
+; X64-NEXT:    xorq %rax, %rsi
+; X64-NEXT:    subq %rax, %rsi
+; X64-NEXT:    sbbq %rax, %rdx
+; X64-NEXT:    sbbq %rax, %r9
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    sbbq %rax, %r8
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    bsrq %r8, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    bsrq %r9, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    orq $64, %r10
+; X64-NEXT:    testq %r8, %r8
+; X64-NEXT:    cmovneq %rdi, %r10
+; X64-NEXT:    bsrq %rdx, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    bsrq %rsi, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    orq $64, %r11
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %rdi, %r11
+; X64-NEXT:    orq $128, %r11
+; X64-NEXT:    movq %r9, %rdi
+; X64-NEXT:    orq %r8, %rdi
+; X64-NEXT:    cmovneq %r10, %r11
+; X64-NEXT:    movl $253, %r15d
+; X64-NEXT:    subq %r11, %r15
+; X64-NEXT:    movl $0, %r11d
+; X64-NEXT:    movl $0, %r13d
+; X64-NEXT:    sbbq %r13, %r13
+; X64-NEXT:    movl $0, %ebx
+; X64-NEXT:    sbbq %rbx, %rbx
+; X64-NEXT:    sbbq %r11, %r11
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    je .LBB5_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl $255, %ecx
+; X64-NEXT:    cmpq %r15, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r13, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %rbx, %rcx
+; X64-NEXT:    sbbq %r11, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB5_3: # %select.end
+; X64-NEXT:    xorl %r14d, %r14d
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    cmovneq %r14, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    cmovneq %r14, %r10
+; X64-NEXT:    movq %r9, %r12
+; X64-NEXT:    cmovneq %r14, %r12
+; X64-NEXT:    cmoveq %r8, %r14
+; X64-NEXT:    jne .LBB5_4
+; X64-NEXT:  # %bb.10: # %select.end
+; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    xorq $255, %rcx
+; X64-NEXT:    orq %rbx, %rcx
+; X64-NEXT:    movq %r13, %rdi
+; X64-NEXT:    orq %r11, %rdi
+; X64-NEXT:    orq %rcx, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    je .LBB5_11
+; X64-NEXT:  # %bb.8: # %udiv-bb1
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r15d, %ecx
+; X64-NEXT:    notb %cl
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $3, %al
+; X64-NEXT:    andb $24, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movsbq %al, %rax
+; X64-NEXT:    movq 208(%rsp,%rax), %r12
+; X64-NEXT:    movq 216(%rsp,%rax), %r14
+; X64-NEXT:    shldq %cl, %r12, %r14
+; X64-NEXT:    movq 192(%rsp,%rax), %r10
+; X64-NEXT:    movq 200(%rsp,%rax), %rax
+; X64-NEXT:    shldq %cl, %rax, %r12
+; X64-NEXT:    shldq %cl, %r10, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shlq %cl, %r10
+; X64-NEXT:    addq $1, %r15
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    jb .LBB5_9
+; X64-NEXT:  # %bb.5: # %udiv-preheader
+; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %r11, %r13
+; X64-NEXT:    movl $7, %r11d
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $6, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movq 120(%rsp,%rax,8), %rdx
+; X64-NEXT:    movq 112(%rsp,%rax,8), %rdi
+; X64-NEXT:    movq %rdi, %rsi
+; X64-NEXT:    shrdq %cl, %rdx, %rsi
+; X64-NEXT:    movq 96(%rsp,%rax,8), %r8
+; X64-NEXT:    movq 104(%rsp,%rax,8), %rax
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    shrdq %cl, %rdi, %r9
+; X64-NEXT:    shrq %cl, %rdx
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    shrdq %cl, %rax, %r8
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    addq $-1, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    adcq $-1, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    adcq $-1, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    adcq $-1, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB5_6: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %rsi, %rdx
+; X64-NEXT:    shldq $1, %r9, %rsi
+; X64-NEXT:    shldq $1, %r8, %r9
+; X64-NEXT:    shldq $1, %r14, %r8
+; X64-NEXT:    shldq $1, %r12, %r14
+; X64-NEXT:    orq %rdi, %r14
+; X64-NEXT:    shldq $1, %r11, %r12
+; X64-NEXT:    orq %rax, %r12
+; X64-NEXT:    shldq $1, %r10, %r11
+; X64-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    addq %r10, %r10
+; X64-NEXT:    orq %rcx, %r10
+; X64-NEXT:    cmpq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %r9, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rsi, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rdx, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movl $7, %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    subq %rax, %r8
+; X64-NEXT:    sbbq $0, %r9
+; X64-NEXT:    sbbq $0, %rsi
+; X64-NEXT:    sbbq $0, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    addq $-1, %rdi
+; X64-NEXT:    adcq $-1, %r15
+; X64-NEXT:    adcq $-1, %rbx
+; X64-NEXT:    adcq $-1, %r13
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    orq %r13, %rax
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %rbx, %rdi
+; X64-NEXT:    orq %rax, %rdi
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    movl $0, %edi
+; X64-NEXT:    jne .LBB5_6
+; X64-NEXT:  .LBB5_7: # %udiv-loop-exit
+; X64-NEXT:    shldq $1, %r12, %r14
+; X64-NEXT:    shldq $1, %r11, %r12
+; X64-NEXT:    shldq $1, %r10, %r11
+; X64-NEXT:    leaq (%rcx,%r10,2), %r10
+; X64-NEXT:    movq %r11, %rcx
+; X64-NEXT:  .LBB5_11: # %udiv-end
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    xorq %rax, %r12
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    xorq %rax, %r10
+; X64-NEXT:    subq %rax, %r10
+; X64-NEXT:    sbbq %rax, %rcx
+; X64-NEXT:    sbbq %rax, %r12
+; X64-NEXT:    sbbq %rax, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %r10, (%rax)
+; X64-NEXT:    movq %rcx, 8(%rax)
+; X64-NEXT:    movq %r12, 16(%rax)
+; X64-NEXT:    movq %r14, 24(%rax)
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB5_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB5_3
+; X64-NEXT:  .LBB5_9:
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    jmp .LBB5_7
+; X64-NEXT:  .LBB5_4:
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    jmp .LBB5_11
+  %r = sdiv i256 %a, 7
+  ret i256 %r
+}
diff --git a/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
index a635d55d2033d..505077f5df5f5 100644
--- a/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
+++ b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
@@ -6,242 +6,12 @@
 ; Function Attrs: noinline optnone
 define double @main(i224 %0) #0 {
 ; CHECK-LABEL: main:
-; CHECK:       # %bb.0: # %entryitofp-entry
-; CHECK-NEXT:    pushq %rbp
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %r15
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    pushq %r13
-; CHECK-NEXT:    .cfi_def_cfa_offset 40
-; CHECK-NEXT:    pushq %r12
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 56
-; CHECK-NEXT:    subq $88, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 144
-; CHECK-NEXT:    .cfi_offset %rbx, -56
-; CHECK-NEXT:    .cfi_offset %r12, -48
-; CHECK-NEXT:    .cfi_offset %r13, -40
-; CHECK-NEXT:    .cfi_offset %r14, -32
-; CHECK-NEXT:    .cfi_offset %r15, -24
-; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    movq %rdi, %r8
-; CHECK-NEXT:    orq %rdx, %r8
-; CHECK-NEXT:    movq %rsi, %r9
-; CHECK-NEXT:    orq %rax, %r9
-; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    orq %r9, %r8
-; CHECK-NEXT:    je .LBB0_10
-; CHECK-NEXT:    jmp .LBB0_1
-; CHECK-NEXT:  .LBB0_1: # %itofp-if-end
-; CHECK-NEXT:    movslq %ecx, %rax
-; CHECK-NEXT:    movq %rax, %r9
-; CHECK-NEXT:    sarq $31, %r9
-; CHECK-NEXT:    sarq $63, %rax
-; CHECK-NEXT:    xorq %rax, %rcx
-; CHECK-NEXT:    xorq %rax, %rdx
-; CHECK-NEXT:    xorq %rax, %rsi
-; CHECK-NEXT:    xorq %r9, %rdi
-; CHECK-NEXT:    subq %r9, %rdi
-; CHECK-NEXT:    sbbq %rax, %rsi
-; CHECK-NEXT:    sbbq %rax, %rdx
-; CHECK-NEXT:    sbbq %rax, %rcx
-; CHECK-NEXT:    movq %rcx, %r8
-; CHECK-NEXT:    shldq $32, %rdx, %r8
-; CHECK-NEXT:    bsrq %r8, %rax
-; CHECK-NEXT:    xorl $63, %eax
-; CHECK-NEXT:    movq %rdx, %r10
-; CHECK-NEXT:    shldq $32, %rsi, %r10
-; CHECK-NEXT:    bsrq %r10, %r11
-; CHECK-NEXT:    xorl $63, %r11d
-; CHECK-NEXT:    orl $64, %r11d
-; CHECK-NEXT:    testq %r8, %r8
-; CHECK-NEXT:    cmovnel %eax, %r11d
-; CHECK-NEXT:    movq %rsi, %rbx
-; CHECK-NEXT:    shldq $32, %rdi, %rbx
-; CHECK-NEXT:    bsrq %rbx, %r14
-; CHECK-NEXT:    xorl $63, %r14d
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    shlq $32, %rax
-; CHECK-NEXT:    bsrq %rax, %rax
-; CHECK-NEXT:    xorl $63, %eax
-; CHECK-NEXT:    orl $64, %eax
-; CHECK-NEXT:    testq %rbx, %rbx
-; CHECK-NEXT:    cmovnel %r14d, %eax
-; CHECK-NEXT:    subl $-128, %eax
-; CHECK-NEXT:    orq %r8, %r10
-; CHECK-NEXT:    cmovnel %r11d, %eax
-; CHECK-NEXT:    movl $224, %r11d
-; CHECK-NEXT:    subl %eax, %r11d
-; CHECK-NEXT:    movl $223, %r10d
-; CHECK-NEXT:    subl %eax, %r10d
-; CHECK-NEXT:    cmpl $53, %r11d
-; CHECK-NEXT:    jle .LBB0_8
-; CHECK-NEXT:  # %bb.2: # %itofp-if-then4
-; CHECK-NEXT:    movl %r11d, %r8d
-; CHECK-NEXT:    subl $54, %r8d
-; CHECK-NEXT:    je .LBB0_4
-; CHECK-NEXT:    jmp .LBB0_3
-; CHECK-NEXT:  .LBB0_3: # %itofp-if-then4
-; CHECK-NEXT:    movl %r11d, %r8d
-; CHECK-NEXT:    subl $55, %r8d
-; CHECK-NEXT:    jne .LBB0_5
-; CHECK-NEXT:  # %bb.11:
-; CHECK-NEXT:    jmp .LBB0_6
-; CHECK-NEXT:  .LBB0_4: # %itofp-sw-bb
-; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    shldq $1, %rdi, %rax
-; CHECK-NEXT:    movq %rdx, %r8
-; CHECK-NEXT:    shldq $1, %rsi, %r8
-; CHECK-NEXT:    shldq $1, %rdx, %rcx
-; CHECK-NEXT:    addq %rdi, %rdi
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    movq %r8, %rdx
-; CHECK-NEXT:    jmp .LBB0_6
-; CHECK-NEXT:  .LBB0_5: # %itofp-sw-default
-; CHECK-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movl %ecx, %r8d
-; CHECK-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $-87, %r8b
-; CHECK-NEXT:    subb %al, %r8b
-; CHECK-NEXT:    movb %r8b, %bl
-; CHECK-NEXT:    shrb $6, %bl
-; CHECK-NEXT:    movzbl %bl, %r12d
-; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, (%rsp)
-; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq -24(%rsp,%r12,8), %rbx
-; CHECK-NEXT:    movq -32(%rsp,%r12,8), %r13
-; CHECK-NEXT:    movq %rcx, %rbp
-; CHECK-NEXT:    movb %r8b, %cl
-; CHECK-NEXT:    movq %r13, %r14
-; CHECK-NEXT:    shrdq %cl, %rbx, %r14
-; CHECK-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq -48(%rsp,%r12,8), %r15
-; CHECK-NEXT:    movq -40(%rsp,%r12,8), %r12
-; CHECK-NEXT:    movb %r8b, %cl
-; CHECK-NEXT:    movq %r12, %r14
-; CHECK-NEXT:    shrdq %cl, %r13, %r14
-; CHECK-NEXT:    movb %r8b, %cl
-; CHECK-NEXT:    shrq %cl, %rbx
-; CHECK-NEXT:    movb %r8b, %cl
-; CHECK-NEXT:    shrdq %cl, %r12, %r15
-; CHECK-NEXT:    addb $55, %al
-; CHECK-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rbp, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    shrb $3, %cl
-; CHECK-NEXT:    andb $24, %cl
-; CHECK-NEXT:    negb %cl
-; CHECK-NEXT:    movsbq %cl, %rdx
-; CHECK-NEXT:    movq -80(%rsp,%rdx), %rsi
-; CHECK-NEXT:    movq -72(%rsp,%rdx), %rdi
-; CHECK-NEXT:    movq -64(%rsp,%rdx), %r8
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    movq %r8, %r12
-; CHECK-NEXT:    shldq %cl, %rdi, %r12
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    movq %rsi, %r13
-; CHECK-NEXT:    shlq %cl, %r13
-; CHECK-NEXT:    orq %r12, %r13
-; CHECK-NEXT:    movq -56(%rsp,%rdx), %rdx
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    shldq %cl, %r8, %rdx
-; CHECK-NEXT:    movl %edx, %edx
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    shldq %cl, %rsi, %rdi
-; CHECK-NEXT:    orq %rdx, %rdi
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    orq %rdi, %r13
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    orq %rax, %r15
-; CHECK-NEXT:    movq %r15, %rdi
-; CHECK-NEXT:    movq %r14, %rsi
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-NEXT:    movq %rbx, %rcx
-; CHECK-NEXT:    jmp .LBB0_6
-; CHECK-NEXT:  .LBB0_6: # %itofp-sw-epilog
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    orq %rax, %rdi
-; CHECK-NEXT:    addq $1, %rdi
-; CHECK-NEXT:    adcq $0, %rsi
-; CHECK-NEXT:    adcq $0, %rdx
-; CHECK-NEXT:    adcq $0, %rcx
-; CHECK-NEXT:    movq %rsi, %rdx
-; CHECK-NEXT:    shldq $62, %rdi, %rdx
-; CHECK-NEXT:    movq %rdx, %rax
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:    btq $55, %rdi
-; CHECK-NEXT:    jae .LBB0_9
-; CHECK-NEXT:    jmp .LBB0_7
-; CHECK-NEXT:  .LBB0_7: # %itofp-if-then20
-; CHECK-NEXT:    shldq $61, %rdi, %rsi
-; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:    movq %rsi, %rdx
-; CHECK-NEXT:    movl %r11d, %r10d
-; CHECK-NEXT:    jmp .LBB0_9
-; CHECK-NEXT:  .LBB0_8: # %itofp-if-else
-; CHECK-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    addb $85, %al
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    shrb $3, %cl
-; CHECK-NEXT:    andb $24, %cl
-; CHECK-NEXT:    negb %cl
-; CHECK-NEXT:    movsbq %cl, %rcx
-; CHECK-NEXT:    movq 48(%rsp,%rcx), %rdx
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq %rdx, %rax
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:  .LBB0_9: # %itofp-if-end26
-; CHECK-NEXT:    andl $-2147483648, %r9d # imm = 0x80000000
-; CHECK-NEXT:    shll $20, %r10d
-; CHECK-NEXT:    addl $1072693248, %r10d # imm = 0x3FF00000
-; CHECK-NEXT:    andl $1048575, %eax # imm = 0xFFFFF
-; CHECK-NEXT:    orl %r9d, %eax
-; CHECK-NEXT:    orl %r10d, %eax
-; CHECK-NEXT:    movl %eax, %eax
-; CHECK-NEXT:    shlq $32, %rax
-; CHECK-NEXT:    movabsq $4294967295, %rcx # imm = 0xFFFFFFFF
-; CHECK-NEXT:    andq %rcx, %rdx
-; CHECK-NEXT:    orq %rdx, %rax
-; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:  .LBB0_10: # %itofp-return
-; CHECK-NEXT:    addq $88, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 56
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    popq %r12
-; CHECK-NEXT:    .cfi_def_cfa_offset 40
-; CHECK-NEXT:    popq %r13
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    popq %r15
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    movslq %ecx, %rcx
+; CHECK-NEXT:    callq __floatoidf at PLT
+; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/fp-i129.ll b/llvm/test/CodeGen/X86/fp-i129.ll
index c55c19abbd9b8..cf260b8cc4773 100644
--- a/llvm/test/CodeGen/X86/fp-i129.ll
+++ b/llvm/test/CodeGen/X86/fp-i129.ll
@@ -1,94 +1,136 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X64
 
+; On i686, these are expanded inline. On x86_64 with MaxLargeFPConvertBitWidthSupported=256,
+; i129 is promoted to i256 and uses __fix*oi / __float*oi libcalls.
 define i129 @fptosi_float(float %a) nounwind {
-; CHECK-LABEL: fptosi_float:
-; CHECK-NOT:     call
+; X86-LABEL: fptosi_float:
+; X86-NOT:     __fixsfoi
+;
+; X64-LABEL: fptosi_float:
+; X64:         callq __fixsfoi at PLT
   %res = fptosi float %a to i129
   ret i129 %res
 }
 
 define i129 @fptosi_double(double %a) nounwind {
-; CHECK-LABEL: fptosi_double:
-; CHECK-NOT:     call
+; X86-LABEL: fptosi_double:
+; X86-NOT:     __fixdfoi
+;
+; X64-LABEL: fptosi_double:
+; X64:         callq __fixdfoi at PLT
   %res = fptosi double %a to i129
   ret i129 %res
 }
 
 define i129 @fptosi_fp128(fp128 %a) nounwind {
-; CHECK-LABEL: fptosi_fp128:
-; CHECK-NOT:     call
+; X86-LABEL: fptosi_fp128:
+; X86-NOT:     __fixtfoi
+;
+; X64-LABEL: fptosi_fp128:
+; X64:         callq __fixtfoi at PLT
   %res = fptosi fp128 %a to i129
   ret i129 %res
 }
 
 define i129 @fptoui_float(float %a) nounwind {
-; CHECK-LABEL: fptoui_float:
-; CHECK-NOT:     call
+; X86-LABEL: fptoui_float:
+; X86-NOT:     __fixunssfoi
+;
+; X64-LABEL: fptoui_float:
+; X64:         callq __fixunssfoi at PLT
   %res = fptoui float %a to i129
   ret i129 %res
 }
 
 define i129 @fptoui_double(double %a) nounwind {
-; CHECK-LABEL: fptoui_double:
-; CHECK-NOT:     call
+; X86-LABEL: fptoui_double:
+; X86-NOT:     __fixunsdfoi
+;
+; X64-LABEL: fptoui_double:
+; X64:         callq __fixunsdfoi at PLT
   %res = fptoui double %a to i129
   ret i129 %res
 }
 
 define i129 @fptoui_fp128(fp128 %a) nounwind {
-; CHECK-LABEL: fptoui_fp128:
-; CHECK-NOT:     call
+; X86-LABEL: fptoui_fp128:
+; X86-NOT:     __fixunstfoi
+;
+; X64-LABEL: fptoui_fp128:
+; X64:         callq __fixunstfoi at PLT
   %res = fptoui fp128 %a to i129
   ret i129 %res
 }
 
 define float @sitofp_float(i129 %a) nounwind {
-; CHECK-LABEL: sitofp_float:
-; CHECK-NOT:     call
+; X86-LABEL: sitofp_float:
+; X86-NOT:     __floatoisf
+;
+; X64-LABEL: sitofp_float:
+; X64:         callq __floatoisf at PLT
   %res = sitofp i129 %a to float
   ret float %res
 }
 
 define double @sitofp_double(i129 %a) nounwind {
-; CHECK-LABEL: sitofp_double:
-; CHECK-NOT:     call
+; X86-LABEL: sitofp_double:
+; X86-NOT:     __floatoidf
+;
+; X64-LABEL: sitofp_double:
+; X64:         callq __floatoidf at PLT
   %res = sitofp i129 %a to double
   ret double %res
 }
 
 define fp128 @sitofp_fp128(i129 %a) nounwind {
-; CHECK-LABEL: sitofp_fp128:
-; CHECK-NOT:     call
+; X86-LABEL: sitofp_fp128:
+; X86-NOT:     __floatoitf
+;
+; X64-LABEL: sitofp_fp128:
+; X64:         callq __floatoitf at PLT
   %res = sitofp i129 %a to fp128
   ret fp128 %res
 }
 
 define float @uitofp_float(i129 %a) nounwind {
-; CHECK-LABEL: uitofp_float:
-; CHECK-NOT:     call
+; X86-LABEL: uitofp_float:
+; X86-NOT:     __floatunoisf
+;
+; X64-LABEL: uitofp_float:
+; X64:         callq __floatunoisf at PLT
   %res = uitofp i129 %a to float
   ret float %res
 }
 
 define double @uitofp_double(i129 %a) nounwind {
-; CHECK-LABEL: uitofp_double:
-; CHECK-NOT:     call
+; X86-LABEL: uitofp_double:
+; X86-NOT:     __floatunoidf
+;
+; X64-LABEL: uitofp_double:
+; X64:         callq __floatunoidf at PLT
   %res = uitofp i129 %a to double
   ret double %res
 }
 
 define fp128 @uitofp_fp128(i129 %a) nounwind {
-; CHECK-LABEL: uitofp_fp128:
-; CHECK-NOT:     call
+; X86-LABEL: uitofp_fp128:
+; X86-NOT:     __floatunoitf
+;
+; X64-LABEL: uitofp_fp128:
+; X64:         callq __floatunoitf at PLT
   %res = uitofp i129 %a to fp128
   ret fp128 %res
 }
 
-; higher sizes
+; i257 is wider than MaxLargeFPConvertBitWidthSupported=256, so the FP conversion
+; is expanded inline. The inline expansion uses i256 multiply/shift libcalls.
 define i257 @fptosi257_double(double %a) nounwind {
-; CHECK-LABEL: fptosi257_double:
-; CHECK-NOT:     call
+; X86-LABEL: fptosi257_double:
+; X86-NOT:     __fixdfoi
+;
+; X64-LABEL: fptosi257_double:
+; X64-NOT:     __fixdfoi
   %res = fptosi double %a to i257
   ret i257 %res
 }
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
index f78e34ef60569..0cbe783b69c0e 100644
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -118,8 +118,330 @@ define i128 @test2(i128 %x) nounwind {
 
 define i128 @test3(i128 %x) nounwind {
 ; X86-LABEL: test3:
-; X86 doesn't have __divti3, so the urem is expanded into a loop.
-; X86: udiv-do-while
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    xorl %ebx, %eax
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    subl %ebx, %esi
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    jne .LBB2_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB2_3
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB2_3: # %_udiv-special-cases
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB2_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB2_6
+; X86-NEXT:  .LBB2_4:
+; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB2_6: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    jne .LBB2_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:  .LBB2_8: # %_udiv-special-cases
+; X86-NEXT:    movl $61, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    je .LBB2_9
+; X86-NEXT:  # %bb.10: # %select.false.sink
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $127, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb %cl
+; X86-NEXT:  .LBB2_11: # %select.end
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jne .LBB2_13
+; X86-NEXT:  # %bb.12: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:  .LBB2_13: # %select.end
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    jne .LBB2_14
+; X86-NEXT:  # %bb.20: # %select.end
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    xorl $127, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    je .LBB2_21
+; X86-NEXT:  # %bb.18: # %udiv-bb1
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 136(%esp,%eax), %edx
+; X86-NEXT:    movl 140(%esp,%eax), %edi
+; X86-NEXT:    shldl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %edi
+; X86-NEXT:    movl 132(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB2_19
+; X86-NEXT:  # %bb.15: # %udiv-preheader
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    movzbl %al, %edi
+; X86-NEXT:    movl 92(%esp,%edi), %eax
+; X86-NEXT:    movl 88(%esp,%edi), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edi), %ebx
+; X86-NEXT:    movl 84(%esp,%edi), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $3, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB2_16: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl $4, %ebx
+; X86-NEXT:    andl %ebx, %edi
+; X86-NEXT:    movl $3, %ebx
+; X86-NEXT:    andl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl $-1, %edi
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB2_16
+; X86-NEXT:  .LBB2_17: # %udiv-loop-exit
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    leal (%esi,%eax,2), %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:  .LBB2_21: # %udiv-end
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    subl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB2_9:
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    jmp .LBB2_11
+; X86-NEXT:  .LBB2_19:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB2_17
+; X86-NEXT:  .LBB2_14:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jmp .LBB2_21
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
@@ -129,6 +451,7 @@ define i128 @test3(i128 %x) nounwind {
 ; X64-NEXT:    callq __divti3 at PLT
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
+; X86 doesn't have __divti3, so the sdiv is expanded into a division loop.
   %tmp = sdiv i128 %x, -73786976294838206467
   ret i128 %tmp
 }
diff --git a/llvm/test/CodeGen/X86/memfold-mov32r0.ll b/llvm/test/CodeGen/X86/memfold-mov32r0.ll
index f7cbf6c33c94c..985b8a597dee2 100644
--- a/llvm/test/CodeGen/X86/memfold-mov32r0.ll
+++ b/llvm/test/CodeGen/X86/memfold-mov32r0.ll
@@ -3,7 +3,7 @@
 ; CHECK:    movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 define i32 @test() nounwind {
 entry:
-  %div = udiv i256 0, 0
-  store i256 %div, ptr null, align 16
+  %div = udiv i512 0, 0
+  store i512 %div, ptr null, align 16
   ret i32 0
 }
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index bb93e34fda7c4..07096cd3482ec 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -4817,1126 +4817,668 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-LABEL: test_1024:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    subq $240, %rsp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $1216, %rsp # imm = 0x4C0
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 40(%rdi), %rbx
-; X64-NEXT:    movq 32(%rdi), %r12
-; X64-NEXT:    movq 56(%rdi), %r15
-; X64-NEXT:    movq 48(%rdi), %r10
-; X64-NEXT:    movq (%rsi), %r11
-; X64-NEXT:    movq 8(%rsi), %r14
-; X64-NEXT:    movq %rsi, %r13
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r9, %r8
-; X64-NEXT:    adcq %rsi, %r10
-; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %r9d
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r10, %rsi
-; X64-NEXT:    adcq %r9, %rcx
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq (%rdi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r9, %r11
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    movq 8(%rdi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r10, %r9
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rbx, %r11
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r9, %r15
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rbx
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    adcq %r8, %rbx
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 16(%r13), %r8
-; X64-NEXT:    movq %r12, %r10
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rdi, %r12
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq 24(%r13), %rbp
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    addq %r12, %rax
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    adcq %r9, %r13
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r13, %r9
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    addq %r15, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbx, %r12
-; X64-NEXT:    movq %r12, (%rsp) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    addq %rsi, %r9
-; X64-NEXT:    adcq %rcx, %rdi
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %rsi, %rcx
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %r9, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rdi, %rbx
+; X64-NEXT:    movq 16(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 24(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 96(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 104(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 112(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 120(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 64(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 72(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 80(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 88(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 32(%rdi), %rbx
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    movq 40(%rdi), %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq 16(%r14), %r11
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 24(%r14), %r8
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq 48(%rdi), %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 56(%rdi), %r8
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rbx
-; X64-NEXT:    adcq %rsi, %r15
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r15, %rdi
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq (%r14), %rbp
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq 96(%rsi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq 8(%r14), %r14
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rsi, %r12
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    addq %r12, %rax
+; X64-NEXT:    movq 104(%rsi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %rsi
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r14, %r15
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rsi, %r13
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %r9, %r13
-; X64-NEXT:    adcq %rbx, %r12
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq 112(%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 120(%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq (%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 8(%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 16(%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 24(%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 32(%rsi), %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 40(%rsi), %r13
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 48(%rsi), %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 56(%rsi), %r14
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rsi, %rbx
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %r9, %rbp
-; X64-NEXT:    setb %r9b
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %rbp, %rsi
-; X64-NEXT:    movzbl %r9b, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r13, %r10
+; X64-NEXT:    movq 64(%rsi), %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 72(%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 80(%rsi), %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r12, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq %rdi, %rsi
-; X64-NEXT:    adcq %rcx, %r15
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r8, %rdi
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rcx, %r12
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, %rbp
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r12, %r11
-; X64-NEXT:    adcq %rdi, %r13
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, %r9
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    addq %r13, %rax
-; X64-NEXT:    movzbl %dil, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %rsi, %rbx
-; X64-NEXT:    adcq %r15, %r11
-; X64-NEXT:    movzbl %r10b, %ecx
-; X64-NEXT:    adcq %rcx, %rax
-; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    movq 88(%rsi), %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq (%rsp), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    subq $8, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    pushq %r11
+; X64-NEXT:    pushq %r10
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq 32(%r8), %rcx
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rsi, %r11
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq 40(%r8), %rbx
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r11, %rsi
-; X64-NEXT:    adcq %rdi, %r15
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r15, %r11
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r15, %rbp
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %rbx, %rcx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT:    adcq %r13, %r10
-; X64-NEXT:    setb %bl
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r10, %rbp
-; X64-NEXT:    movzbl %bl, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r12, %rbp
-; X64-NEXT:    adcq %rsi, %r15
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq 48(%r8), %rcx
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, %r12
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rsi, %r13
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq 56(%r8), %rsi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r13, %r9
-; X64-NEXT:    adcq %r10, %r14
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r14, %r13
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    addq %rbp, %rbx
-; X64-NEXT:    adcq %r15, %r9
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    addq %r11, %r13
-; X64-NEXT:    adcq %rdi, %rsi
-; X64-NEXT:    setb %r11b
+; X64-NEXT:    movq %r12, %r9
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq %rbx, %r9
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %rbx, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rdi, %r14
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r14, %rbp
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r8, %rdi
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r13, %r12
-; X64-NEXT:    adcq %rsi, %rbp
-; X64-NEXT:    movzbl %r11b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    adcq %rax, (%rsp) # 8-byte Folded Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %rbx, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    movq %r13, %r12
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %rbx, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    movq %r15, %rbx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r12, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r8, %rbx
-; X64-NEXT:    adcq %rsi, %r10
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r10, %rsi
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rcx
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r11, %r13
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r12, %r11
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbp, %r13
-; X64-NEXT:    adcq %rbx, %r12
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, %rbx
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r8, %r10
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r10, %r11
-; X64-NEXT:    adcq %rbp, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %r9, %rbp
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r8, %rbx
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    addq %r13, %r14
-; X64-NEXT:    movq %r14, %r13
-; X64-NEXT:    adcq %r12, %r11
-; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    addq %rsi, %rbx
-; X64-NEXT:    adcq %rcx, %r9
-; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    movq %r15, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %r8, %rax
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    adcq %rsi, %rcx
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %rbx, %r10
-; X64-NEXT:    adcq %r9, %r8
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    adcq %rdi, %r13
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    adcq %rax, %r10
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r14, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r14, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    movq 64(%r13), %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %r13, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r14, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r13, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq %rbx, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %rbx, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq %rbx, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq 72(%r13), %rsi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r8, %rbx
-; X64-NEXT:    adcq %rdi, %r10
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r10, %r9
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r15, %rcx
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r12, %rcx
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, %r12
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r8, %rbp
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r11, %rbp
-; X64-NEXT:    adcq %rbx, %r15
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq $0, %rax
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    adcq %rcx, %r11
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq 80(%r13), %r14
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r8, %r11
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq 88(%r13), %rbx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbp, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    addq %r9, %r13
-; X64-NEXT:    adcq %rdi, %r12
-; X64-NEXT:    setb %bpl
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rdi, %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rbx
 ; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, %r15
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    addq %r10, %rax
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    adcq %r8, %rdi
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r9
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %r13, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r12, %r10
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %bpl, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %rbx
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rbx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    imulq %rcx, %r14
-; X64-NEXT:    addq %rdx, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    imulq %rsi, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r10, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    imulq %r11, %rbx
-; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    addq %r8, %rdi
-; X64-NEXT:    adcq %r14, %rbx
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    addq %rdi, %r14
-; X64-NEXT:    adcq %rbx, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 112(%rcx), %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    imulq %r11, %r10
-; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    movq 120(%rcx), %rax
-; X64-NEXT:    imulq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r12
-; X64-NEXT:    addq %rax, %r10
-; X64-NEXT:    movq 96(%rcx), %r13
-; X64-NEXT:    movq 104(%rcx), %r8
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, %rbx
-; X64-NEXT:    imulq %r8, %rbx
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rbx, %rdx
-; X64-NEXT:    imulq %r13, %r9
-; X64-NEXT:    addq %rdx, %r9
-; X64-NEXT:    addq %rbp, %rdi
-; X64-NEXT:    adcq %r10, %r9
-; X64-NEXT:    movq %r9, %r15
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %r10, %r12
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    adcq %rdi, %r15
+; X64-NEXT:    setb %dil
+; X64-NEXT:    movzbl %dil, %edi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    adcq %rsi, %rdi
+; X64-NEXT:    adcq %rdx, %r8
+; X64-NEXT:    adcq $0, %rax
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r12, %r13
-; X64-NEXT:    adcq %rbp, %r10
-; X64-NEXT:    setb %bl
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    addq %r10, %rax
-; X64-NEXT:    movzbl %bl, %r8d
-; X64-NEXT:    adcq %r8, %rdx
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    adcq %r15, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %rax
-; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq 80(%r14), %r10
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 88(%r14), %r15
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    addq %rdx, %r12
+; X64-NEXT:    adcq %rsi, %r13
+; X64-NEXT:    setb %dl
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    movzbl %dl, %edx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r15
 ; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r9, %rdi
-; X64-NEXT:    adcq %r8, %rcx
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rcx, %r12
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    movq 64(%r14), %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq 72(%r14), %r8
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r11, %r14
-; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r9
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbx, %r11
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r11, %rbp
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %rbx
-; X64-NEXT:    addq %rsi, %rbp
-; X64-NEXT:    adcq %rdi, %rbx
+; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r8, %rbx
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r9, %rcx
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, %r10
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rsi, %r11
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    adcq %rdi, %r13
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r13, %rdi
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    addq %rbp, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbx, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    addq %r12, %rdi
-; X64-NEXT:    adcq %r15, %rsi
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r9, %rbx
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r8, %r9
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rax, %r12
 ; X64-NEXT:    adcq %r11, %r13
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    addq %r13, %rax
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rsi, %r10
+; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    adcq %r9, %rsi
+; X64-NEXT:    setb %al
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq %r8, %r10
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq %r9, %r11
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    adcq %rdi, %r9
+; X64-NEXT:    setb %cl
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    adcq %rdx, %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r11
+; X64-NEXT:    adcq %rsi, %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 96(%rcx), %rsi
-; X64-NEXT:    imulq %rsi, %r9
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r9, %rdx
-; X64-NEXT:    movq 104(%rcx), %r9
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    imulq %r9, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq 112(%rcx), %rax
-; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    adcq %rax, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    imulq %r10, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq 120(%r14), %r13
-; X64-NEXT:    imulq %rbx, %r13
-; X64-NEXT:    addq %rdx, %r13
-; X64-NEXT:    addq %rdi, %r8
-; X64-NEXT:    adcq %r11, %r13
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbx, %rcx
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rbx
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rbx, %r12
-; X64-NEXT:    adcq %r11, %rcx
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    adcq %rax, %rbx
-; X64-NEXT:    addq %r8, %r9
-; X64-NEXT:    adcq %r13, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    imulq %r10, %rdi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %r14, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq $0, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    imulq %r8, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    imulq %rdi, %rbp
-; X64-NEXT:    addq %rdx, %rbp
-; X64-NEXT:    addq %rcx, %r11
-; X64-NEXT:    adcq %r13, %rbp
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, %r15
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %rcx, %rsi
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movzbl %dl, %edx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    addq %rax, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rcx, %r12
+; X64-NEXT:    adcq %rdx, %r15
+; X64-NEXT:    adcq %rsi, %r14
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq $0, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rsi, %rcx
-; X64-NEXT:    adcq %r8, %rdi
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    adcq %rbp, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r12, %rcx
+; X64-NEXT:    addq %rax, %rdi
+; X64-NEXT:    adcq %rcx, %r8
+; X64-NEXT:    setb %al
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movzbl %al, %r10d
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    adcq %r8, %rdx
+; X64-NEXT:    adcq %r10, %rcx
 ; X64-NEXT:    adcq %r9, %rax
-; X64-NEXT:    adcq %rbx, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT:    movq %rdi, %r9
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    adcq %r12, %rdx
+; X64-NEXT:    adcq %r15, %rcx
+; X64-NEXT:    adcq %r14, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r13, %rdi
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, (%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 8(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 16(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 24(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 32(%rsi)
-; X64-NEXT:    movq (%rsp), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 40(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 48(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 56(%rsi)
-; X64-NEXT:    movq %r8, 64(%rsi)
-; X64-NEXT:    movq %r9, 72(%rsi)
-; X64-NEXT:    movq %r10, 80(%rsi)
-; X64-NEXT:    movq %r11, 88(%rsi)
-; X64-NEXT:    movq %r13, 96(%rsi)
-; X64-NEXT:    movq %rcx, 104(%rsi)
-; X64-NEXT:    movq %rax, 112(%rsi)
-; X64-NEXT:    movq %rdx, 120(%rsi)
-; X64-NEXT:    addq $240, %rsp
+; X64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, 16(%r9)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, 24(%r9)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, 32(%r9)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, 40(%r9)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, 48(%r9)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, 56(%r9)
+; X64-NEXT:    movq %r8, 64(%r9)
+; X64-NEXT:    movq %rdi, 72(%r9)
+; X64-NEXT:    movq %rbx, 80(%r9)
+; X64-NEXT:    movq %r11, 88(%r9)
+; X64-NEXT:    movq %rsi, 96(%r9)
+; X64-NEXT:    movq %rdx, 104(%r9)
+; X64-NEXT:    movq %rcx, 112(%r9)
+; X64-NEXT:    movq %rax, 120(%r9)
+; X64-NEXT:    movaps %xmm0, (%r9)
+; X64-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 2421aabdbcd99..b91a6e184c400 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -1174,277 +1174,151 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-LABEL: test_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %rdx, (%rsp) # 8-byte Spill
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq (%rdi), %rbx
-; X64-NEXT:    movq 8(%rdi), %rdi
-; X64-NEXT:    movq 24(%rax), %r14
-; X64-NEXT:    movq 16(%rax), %rax
-; X64-NEXT:    movq (%rsi), %r8
-; X64-NEXT:    movq 8(%rsi), %r11
-; X64-NEXT:    movq %rsi, %r13
-; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $288, %rsp # imm = 0x120
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 32(%rdi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rcx, %r10
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r10, %r15
-; X64-NEXT:    adcq %r9, %rcx
-; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %esi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rbx, %rsi
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq 40(%rdi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rcx, %r14
-; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    movq 48(%rdi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbx, %rcx
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    addq %rbp, %rbx
-; X64-NEXT:    adcq %r15, %r14
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 56(%rdi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq (%rdi), %r9
+; X64-NEXT:    movq 8(%rdi), %r10
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 16(%rdi), %r12
+; X64-NEXT:    movq 24(%rdi), %r13
+; X64-NEXT:    movq 16(%rsi), %rax
+; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq 24(%rsi), %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq (%rsi), %r15
+; X64-NEXT:    movq 8(%rsi), %r14
+; X64-NEXT:    movq 32(%rsi), %rax
+; X64-NEXT:    movq 40(%rsi), %rdx
+; X64-NEXT:    movq 48(%rsi), %rcx
+; X64-NEXT:    movq 56(%rsi), %r8
+; X64-NEXT:    subq $8, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %r9, %rbx
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %r10
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    movq %r13, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    movq %r13, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    movq %r12, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 16(%r13), %r10
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r12
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %rcx, %rbp
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq 24(%rsi), %rsi
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rbp, %r11
-; X64-NEXT:    adcq %r15, %rcx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %rcx, %rbp
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %rbx, %r13
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq %r9, %rbp
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %r9, %rcx
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %rbp, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq 32(%r8), %r15
-; X64-NEXT:    imulq %r15, %rsi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    movq 40(%r8), %rsi
-; X64-NEXT:    imulq %rsi, %r10
-; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    movq 48(%r8), %rax
-; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    imulq %r9, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq 56(%r8), %r8
-; X64-NEXT:    imulq %r11, %r8
-; X64-NEXT:    addq %rdx, %r8
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %r8
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r15, %r13
-; X64-NEXT:    adcq %rdi, %rcx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rcx, %r10
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbx, %r10
-; X64-NEXT:    adcq %r8, %r12
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq 48(%r8), %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %r14, %rsi
-; X64-NEXT:    addq %rdx, %rsi
-; X64-NEXT:    movq %r8, %rdx
-; X64-NEXT:    movq 56(%r8), %rax
-; X64-NEXT:    imulq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r8
-; X64-NEXT:    addq %rax, %rsi
-; X64-NEXT:    movq 32(%rdx), %rbp
-; X64-NEXT:    movq 40(%rdx), %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    imulq %r9, %rdi
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    imulq %rbp, %r11
-; X64-NEXT:    addq %rdx, %r11
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    adcq %rsi, %r11
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rdi, %r8
-; X64-NEXT:    adcq %r15, %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    addq %r8, %rsi
+; X64-NEXT:    adcq %rdi, %rdx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movzbl %dil, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    adcq %r11, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    adcq %r13, %r8
-; X64-NEXT:    adcq %r10, %rax
-; X64-NEXT:    adcq %r12, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, (%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 8(%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 16(%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 24(%rcx)
-; X64-NEXT:    movq %rsi, 32(%rcx)
-; X64-NEXT:    movq %r8, 40(%rcx)
-; X64-NEXT:    movq %rax, 48(%rcx)
-; X64-NEXT:    movq %rdx, 56(%rcx)
-; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    movzbl %dil, %edi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    addq %rsi, %r11
+; X64-NEXT:    adcq %rdx, %r14
+; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    adcq %r8, %r9
+; X64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movaps %xmm0, (%rdx)
+; X64-NEXT:    movq %rax, 16(%rdx)
+; X64-NEXT:    movq %rcx, 24(%rdx)
+; X64-NEXT:    movq %r11, 32(%rdx)
+; X64-NEXT:    movq %r14, 40(%rdx)
+; X64-NEXT:    movq %r10, 48(%rdx)
+; X64-NEXT:    movq %r9, 56(%rdx)
+; X64-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 28029793211f0..2b79d7e145368 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -12,141 +12,163 @@
 define i256 @test1(i256 %a) nounwind {
 ; ILP-LABEL: test1:
 ; ILP:       # %bb.0:
+; ILP-NEXT:    pushq %rbp
+; ILP-NEXT:    movq %rsp, %rbp
+; ILP-NEXT:    andq $-32, %rsp
+; ILP-NEXT:    subq $96, %rsp
 ; ILP-NEXT:    movq %rdi, %rax
 ; ILP-NEXT:    xorps %xmm0, %xmm0
-; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; ILP-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; ILP-NEXT:    movaps %xmm0, (%rsp)
 ; ILP-NEXT:    leal (%rsi,%rsi), %ecx
 ; ILP-NEXT:    addb $3, %cl
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $1, {{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movl %ecx, %edx
 ; ILP-NEXT:    shrb $3, %dl
 ; ILP-NEXT:    andb $24, %dl
 ; ILP-NEXT:    negb %dl
-; ILP-NEXT:    movsbq %dl, %rdx
-; ILP-NEXT:    movq -24(%rsp,%rdx), %rsi
-; ILP-NEXT:    movq -16(%rsp,%rdx), %rdi
-; ILP-NEXT:    shldq %cl, %rsi, %rdi
-; ILP-NEXT:    movq -40(%rsp,%rdx), %r8
-; ILP-NEXT:    movq -32(%rsp,%rdx), %rdx
+; ILP-NEXT:    movsbq %dl, %rsi
+; ILP-NEXT:    movq 48(%rsp,%rsi), %rdx
+; ILP-NEXT:    movq 56(%rsp,%rsi), %rdi
+; ILP-NEXT:    shldq %cl, %rdx, %rdi
+; ILP-NEXT:    movq 32(%rsp,%rsi), %r8
+; ILP-NEXT:    movq 40(%rsp,%rsi), %rsi
 ; ILP-NEXT:    movq %r8, %r9
 ; ILP-NEXT:    shlq %cl, %r9
-; ILP-NEXT:    movq %rdx, %r10
+; ILP-NEXT:    movq %rsi, %r10
 ; ILP-NEXT:    shldq %cl, %r8, %r10
 ; ILP-NEXT:    movq %rdi, 24(%rax)
 ; ILP-NEXT:    movq %r10, 8(%rax)
 ; ILP-NEXT:    movq %r9, (%rax)
-; ILP-NEXT:    shlq %cl, %rsi
+; ILP-NEXT:    shlq %cl, %rdx
 ; ILP-NEXT:    notb %cl
-; ILP-NEXT:    shrq %rdx
+; ILP-NEXT:    shrq %rsi
 ; ILP-NEXT:    # kill: def $cl killed $cl killed $ecx
-; ILP-NEXT:    shrq %cl, %rdx
-; ILP-NEXT:    orq %rsi, %rdx
-; ILP-NEXT:    movq %rdx, 16(%rax)
+; ILP-NEXT:    shrq %cl, %rsi
+; ILP-NEXT:    orq %rdx, %rsi
+; ILP-NEXT:    movq %rsi, 16(%rax)
+; ILP-NEXT:    movq %rbp, %rsp
+; ILP-NEXT:    popq %rbp
 ; ILP-NEXT:    retq
 ;
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbp
+; HYBRID-NEXT:    movq %rsp, %rbp
+; HYBRID-NEXT:    andq $-32, %rsp
+; HYBRID-NEXT:    subq $96, %rsp
 ; HYBRID-NEXT:    movq %rdi, %rax
 ; HYBRID-NEXT:    xorps %xmm0, %xmm0
-; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movaps %xmm0, (%rsp)
+; HYBRID-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $1, {{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    leal (%rsi,%rsi), %ecx
 ; HYBRID-NEXT:    addb $3, %cl
 ; HYBRID-NEXT:    movl %ecx, %edx
 ; HYBRID-NEXT:    shrb $3, %dl
 ; HYBRID-NEXT:    andb $24, %dl
 ; HYBRID-NEXT:    negb %dl
-; HYBRID-NEXT:    movsbq %dl, %rdx
-; HYBRID-NEXT:    movq -24(%rsp,%rdx), %rsi
-; HYBRID-NEXT:    movq -16(%rsp,%rdx), %rdi
-; HYBRID-NEXT:    shldq %cl, %rsi, %rdi
+; HYBRID-NEXT:    movsbq %dl, %rsi
+; HYBRID-NEXT:    movq 48(%rsp,%rsi), %rdx
+; HYBRID-NEXT:    movq 56(%rsp,%rsi), %rdi
+; HYBRID-NEXT:    shldq %cl, %rdx, %rdi
 ; HYBRID-NEXT:    movq %rdi, 24(%rax)
-; HYBRID-NEXT:    movq -40(%rsp,%rdx), %rdi
-; HYBRID-NEXT:    movq -32(%rsp,%rdx), %rdx
-; HYBRID-NEXT:    movq %rdx, %r8
+; HYBRID-NEXT:    movq 32(%rsp,%rsi), %rdi
+; HYBRID-NEXT:    movq 40(%rsp,%rsi), %rsi
+; HYBRID-NEXT:    movq %rsi, %r8
 ; HYBRID-NEXT:    shldq %cl, %rdi, %r8
 ; HYBRID-NEXT:    movq %r8, 8(%rax)
 ; HYBRID-NEXT:    shlq %cl, %rdi
 ; HYBRID-NEXT:    movq %rdi, (%rax)
-; HYBRID-NEXT:    shlq %cl, %rsi
+; HYBRID-NEXT:    shlq %cl, %rdx
 ; HYBRID-NEXT:    notb %cl
-; HYBRID-NEXT:    shrq %rdx
+; HYBRID-NEXT:    shrq %rsi
 ; HYBRID-NEXT:    # kill: def $cl killed $cl killed $ecx
-; HYBRID-NEXT:    shrq %cl, %rdx
-; HYBRID-NEXT:    orq %rsi, %rdx
-; HYBRID-NEXT:    movq %rdx, 16(%rax)
+; HYBRID-NEXT:    shrq %cl, %rsi
+; HYBRID-NEXT:    orq %rdx, %rsi
+; HYBRID-NEXT:    movq %rsi, 16(%rax)
+; HYBRID-NEXT:    movq %rbp, %rsp
+; HYBRID-NEXT:    popq %rbp
 ; HYBRID-NEXT:    retq
 ;
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbp
+; BURR-NEXT:    movq %rsp, %rbp
+; BURR-NEXT:    andq $-32, %rsp
+; BURR-NEXT:    subq $96, %rsp
 ; BURR-NEXT:    movq %rdi, %rax
 ; BURR-NEXT:    xorps %xmm0, %xmm0
-; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; BURR-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; BURR-NEXT:    movaps %xmm0, (%rsp)
+; BURR-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $1, {{[0-9]+}}(%rsp)
 ; BURR-NEXT:    leal (%rsi,%rsi), %ecx
 ; BURR-NEXT:    addb $3, %cl
 ; BURR-NEXT:    movl %ecx, %edx
 ; BURR-NEXT:    shrb $3, %dl
 ; BURR-NEXT:    andb $24, %dl
 ; BURR-NEXT:    negb %dl
-; BURR-NEXT:    movsbq %dl, %rdx
-; BURR-NEXT:    movq -24(%rsp,%rdx), %rsi
-; BURR-NEXT:    movq -16(%rsp,%rdx), %rdi
-; BURR-NEXT:    shldq %cl, %rsi, %rdi
+; BURR-NEXT:    movsbq %dl, %rsi
+; BURR-NEXT:    movq 48(%rsp,%rsi), %rdx
+; BURR-NEXT:    movq 56(%rsp,%rsi), %rdi
+; BURR-NEXT:    shldq %cl, %rdx, %rdi
 ; BURR-NEXT:    movq %rdi, 24(%rax)
-; BURR-NEXT:    movq -40(%rsp,%rdx), %rdi
-; BURR-NEXT:    movq -32(%rsp,%rdx), %rdx
-; BURR-NEXT:    movq %rdx, %r8
+; BURR-NEXT:    movq 32(%rsp,%rsi), %rdi
+; BURR-NEXT:    movq 40(%rsp,%rsi), %rsi
+; BURR-NEXT:    movq %rsi, %r8
 ; BURR-NEXT:    shldq %cl, %rdi, %r8
 ; BURR-NEXT:    movq %r8, 8(%rax)
 ; BURR-NEXT:    shlq %cl, %rdi
 ; BURR-NEXT:    movq %rdi, (%rax)
-; BURR-NEXT:    shlq %cl, %rsi
+; BURR-NEXT:    shlq %cl, %rdx
 ; BURR-NEXT:    notb %cl
-; BURR-NEXT:    shrq %rdx
+; BURR-NEXT:    shrq %rsi
 ; BURR-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BURR-NEXT:    shrq %cl, %rdx
-; BURR-NEXT:    orq %rsi, %rdx
-; BURR-NEXT:    movq %rdx, 16(%rax)
+; BURR-NEXT:    shrq %cl, %rsi
+; BURR-NEXT:    orq %rdx, %rsi
+; BURR-NEXT:    movq %rsi, 16(%rax)
+; BURR-NEXT:    movq %rbp, %rsp
+; BURR-NEXT:    popq %rbp
 ; BURR-NEXT:    retq
 ;
 ; SRC-LABEL: test1:
 ; SRC:       # %bb.0:
+; SRC-NEXT:    pushq %rbp
+; SRC-NEXT:    movq %rsp, %rbp
+; SRC-NEXT:    andq $-32, %rsp
+; SRC-NEXT:    subq $96, %rsp
 ; SRC-NEXT:    movq %rdi, %rax
 ; SRC-NEXT:    leal (%rsi,%rsi), %edx
 ; SRC-NEXT:    addb $3, %dl
 ; SRC-NEXT:    xorps %xmm0, %xmm0
-; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SRC-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SRC-NEXT:    movaps %xmm0, (%rsp)
+; SRC-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $1, {{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movl %edx, %ecx
 ; SRC-NEXT:    shrb $3, %cl
 ; SRC-NEXT:    andb $24, %cl
 ; SRC-NEXT:    negb %cl
 ; SRC-NEXT:    movsbq %cl, %rsi
-; SRC-NEXT:    movq -24(%rsp,%rsi), %rdi
+; SRC-NEXT:    movq 48(%rsp,%rsi), %rdi
 ; SRC-NEXT:    movq %rdi, %r8
 ; SRC-NEXT:    movl %edx, %ecx
 ; SRC-NEXT:    shlq %cl, %r8
 ; SRC-NEXT:    notb %cl
-; SRC-NEXT:    movq -40(%rsp,%rsi), %r9
-; SRC-NEXT:    movq -32(%rsp,%rsi), %r10
+; SRC-NEXT:    movq 32(%rsp,%rsi), %r9
+; SRC-NEXT:    movq 40(%rsp,%rsi), %r10
 ; SRC-NEXT:    movq %r10, %r11
 ; SRC-NEXT:    shrq %r11
 ; SRC-NEXT:    shrq %cl, %r11
 ; SRC-NEXT:    orq %r8, %r11
-; SRC-NEXT:    movq -16(%rsp,%rsi), %rsi
+; SRC-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; SRC-NEXT:    movl %edx, %ecx
 ; SRC-NEXT:    shldq %cl, %rdi, %rsi
 ; SRC-NEXT:    movq %r9, %rdi
@@ -156,10 +178,16 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-NEXT:    movq %r10, 8(%rax)
 ; SRC-NEXT:    movq %rdi, (%rax)
 ; SRC-NEXT:    movq %r11, 16(%rax)
+; SRC-NEXT:    movq %rbp, %rsp
+; SRC-NEXT:    popq %rbp
 ; SRC-NEXT:    retq
 ;
 ; LIN-LABEL: test1:
 ; LIN:       # %bb.0:
+; LIN-NEXT:    pushq %rbp
+; LIN-NEXT:    movq %rsp, %rbp
+; LIN-NEXT:    andq $-32, %rsp
+; LIN-NEXT:    subq $96, %rsp
 ; LIN-NEXT:    movq %rdi, %rax
 ; LIN-NEXT:    leal (%rsi,%rsi), %edx
 ; LIN-NEXT:    addb $3, %dl
@@ -169,21 +197,21 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    negb %cl
 ; LIN-NEXT:    movsbq %cl, %rsi
 ; LIN-NEXT:    xorps %xmm0, %xmm0
-; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq -40(%rsp,%rsi), %rdi
+; LIN-NEXT:    movaps %xmm0, (%rsp)
+; LIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $1, {{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; LIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq 32(%rsp,%rsi), %rdi
 ; LIN-NEXT:    movq %rdi, %r8
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shlq %cl, %r8
 ; LIN-NEXT:    movq %r8, (%rax)
-; LIN-NEXT:    movq -32(%rsp,%rsi), %r8
+; LIN-NEXT:    movq 40(%rsp,%rsi), %r8
 ; LIN-NEXT:    movq %r8, %r9
 ; LIN-NEXT:    shldq %cl, %rdi, %r9
 ; LIN-NEXT:    movq %r9, 8(%rax)
-; LIN-NEXT:    movq -24(%rsp,%rsi), %rdi
+; LIN-NEXT:    movq 48(%rsp,%rsi), %rdi
 ; LIN-NEXT:    movq %rdi, %r9
 ; LIN-NEXT:    shlq %cl, %r9
 ; LIN-NEXT:    shrq %r8
@@ -191,10 +219,12 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    shrq %cl, %r8
 ; LIN-NEXT:    orq %r9, %r8
 ; LIN-NEXT:    movq %r8, 16(%rax)
-; LIN-NEXT:    movq -16(%rsp,%rsi), %rsi
+; LIN-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shldq %cl, %rdi, %rsi
 ; LIN-NEXT:    movq %rsi, 24(%rax)
+; LIN-NEXT:    movq %rbp, %rsp
+; LIN-NEXT:    popq %rbp
 ; LIN-NEXT:    retq
   %b = add i256 %a, 1
   %m = shl i256 %b, 1
diff --git a/llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll b/llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll
new file mode 100644
index 0000000000000..a053175ae4c95
--- /dev/null
+++ b/llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll
@@ -0,0 +1,382 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -O2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- -O2 | FileCheck %s --check-prefix=X64
+;
+; Test i256 shifts with narrow (i32) shift amounts. These use the
+; ExpandShiftWithUnknownAmountBit path (parts expansion) rather than the
+; shift-through-stack approach, because ExpandShiftWithKnownAmountBit
+; cannot determine the high bit of the shift amount.
+
+define void @shl_i256_by_i32(i256 %x, i32 %amt, ptr %r) nounwind {
+; X86-LABEL: shl_i256_by_i32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $112, %esp
+; X86-NEXT:    movzbl 40(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $28, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %edi
+; X86-NEXT:    movl 68(%esp,%edi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 76(%esp,%edi), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 84(%esp,%edi), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shldl %cl, %eax, %ebx
+; X86-NEXT:    movl 88(%esp,%edi), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    movl 64(%esp,%edi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 92(%esp,%edi), %edx
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    movl %edx, 28(%edi)
+; X86-NEXT:    movl %eax, 24(%edi)
+; X86-NEXT:    movl %ebx, 20(%edi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 16(%edi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 8(%edi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    movl %eax, (%edi)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: shl_i256_by_i32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $96, %rsp
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, (%rsp)
+; X64-NEXT:    movl %r8d, %eax
+; X64-NEXT:    shrb $3, %al
+; X64-NEXT:    andb $24, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movsbq %al, %rax
+; X64-NEXT:    movq 32(%rsp,%rax), %rdx
+; X64-NEXT:    movq 40(%rsp,%rax), %rsi
+; X64-NEXT:    movq 48(%rsp,%rax), %rdi
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movl %r8d, %ecx
+; X64-NEXT:    shldq %cl, %rsi, %r10
+; X64-NEXT:    movq 56(%rsp,%rax), %rax
+; X64-NEXT:    shldq %cl, %rdi, %rax
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    shlq %cl, %rdi
+; X64-NEXT:    shldq %cl, %rdx, %rsi
+; X64-NEXT:    movq %rax, 24(%r9)
+; X64-NEXT:    movq %r10, 16(%r9)
+; X64-NEXT:    movq %rsi, 8(%r9)
+; X64-NEXT:    movq %rdi, (%r9)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+entry:
+  %amt256 = zext i32 %amt to i256
+  %res = shl i256 %x, %amt256
+  store i256 %res, ptr %r
+  ret void
+}
+
+define void @lshr_i256_by_i32(i256 %x, i32 %amt, ptr %r) nounwind {
+; X86-LABEL: lshr_i256_by_i32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $112, %esp
+; X86-NEXT:    movzbl 40(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 40(%esp,%eax,4), %edx
+; X86-NEXT:    movl 36(%esp,%eax,4), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%esp,%eax,4), %esi
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%esp,%eax,4), %esi
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl 56(%esp,%eax,4), %edx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl 32(%esp,%eax,4), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%esp,%eax,4), %eax
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl %eax, 28(%ecx)
+; X86-NEXT:    movl %edx, 24(%ecx)
+; X86-NEXT:    movl %esi, 20(%ecx)
+; X86-NEXT:    movl %ebx, 16(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 8(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: lshr_i256_by_i32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $96, %rsp
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, (%rsp)
+; X64-NEXT:    movl %r8d, %eax
+; X64-NEXT:    shrb $6, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movq 16(%rsp,%rax,8), %rdx
+; X64-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-NEXT:    movq 8(%rsp,%rax,8), %rdi
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movl %r8d, %ecx
+; X64-NEXT:    shrdq %cl, %rdx, %r10
+; X64-NEXT:    movq 24(%rsp,%rax,8), %rax
+; X64-NEXT:    shrdq %cl, %rax, %rdx
+; X64-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-NEXT:    shrq %cl, %rax
+; X64-NEXT:    movq %rax, 24(%r9)
+; X64-NEXT:    movq %rdx, 16(%r9)
+; X64-NEXT:    movq %r10, 8(%r9)
+; X64-NEXT:    movq %rsi, (%r9)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+entry:
+  %amt256 = zext i32 %amt to i256
+  %res = lshr i256 %x, %amt256
+  store i256 %res, ptr %r
+  ret void
+}
+
+define void @ashr_i256_by_i32(i256 %x, i32 %amt, ptr %r) nounwind {
+; X86-LABEL: ashr_i256_by_i32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $112, %esp
+; X86-NEXT:    movzbl 40(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 40(%esp,%eax,4), %edx
+; X86-NEXT:    movl 36(%esp,%eax,4), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%esp,%eax,4), %esi
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%esp,%eax,4), %esi
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl 56(%esp,%eax,4), %edx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl 32(%esp,%eax,4), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%esp,%eax,4), %eax
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sarl %cl, %eax
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl %eax, 28(%ecx)
+; X86-NEXT:    movl %edx, 24(%ecx)
+; X86-NEXT:    movl %esi, 20(%ecx)
+; X86-NEXT:    movl %ebx, 16(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 8(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: ashr_i256_by_i32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $96, %rsp
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, (%rsp)
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r8d, %eax
+; X64-NEXT:    shrb $6, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movq 16(%rsp,%rax,8), %rdx
+; X64-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-NEXT:    movq 8(%rsp,%rax,8), %rdi
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movl %r8d, %ecx
+; X64-NEXT:    shrdq %cl, %rdx, %r10
+; X64-NEXT:    movq 24(%rsp,%rax,8), %rax
+; X64-NEXT:    shrdq %cl, %rax, %rdx
+; X64-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-NEXT:    sarq %cl, %rax
+; X64-NEXT:    movq %rax, 24(%r9)
+; X64-NEXT:    movq %rdx, 16(%r9)
+; X64-NEXT:    movq %r10, 8(%r9)
+; X64-NEXT:    movq %rsi, (%r9)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+entry:
+  %amt256 = zext i32 %amt to i256
+  %res = ashr i256 %x, %amt256
+  store i256 %res, ptr %r
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index 128e2199fb56f..0663bec48899a 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -88,31 +88,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ;
 ; CHECK-X64-O0-LABEL: shift1:
 ; CHECK-X64-O0:       # %bb.0: # %entry
-; CHECK-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    pushq %rbp
+; CHECK-X64-O0-NEXT:    movq %rsp, %rbp
+; CHECK-X64-O0-NEXT:    andq $-32, %rsp
+; CHECK-X64-O0-NEXT:    subq $128, %rsp
+; CHECK-X64-O0-NEXT:    movq 24(%rbp), %rax
+; CHECK-X64-O0-NEXT:    movq 16(%rbp), %rax
+; CHECK-X64-O0-NEXT:    movq 32(%rbp), %rax
+; CHECK-X64-O0-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    sarq $63, %rcx
-; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movb %r8b, %cl
 ; CHECK-X64-O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-X64-O0-NEXT:    movb %cl, %dl
 ; CHECK-X64-O0-NEXT:    shrb $6, %dl
 ; CHECK-X64-O0-NEXT:    movzbl %dl, %edx
 ; CHECK-X64-O0-NEXT:    movl %edx, %edi
-; CHECK-X64-O0-NEXT:    movq -56(%rsp,%rdi,8), %rsi
-; CHECK-X64-O0-NEXT:    movq -72(%rsp,%rdi,8), %r8
-; CHECK-X64-O0-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT:    movq 48(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT:    movq 32(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT:    movq 40(%rsp,%rdi,8), %r9
 ; CHECK-X64-O0-NEXT:    movq %r9, %rdx
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %rsi, %rdx
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    movq -48(%rsp,%rdi,8), %rdi
+; CHECK-X64-O0-NEXT:    movq 56(%rsp,%rdi,8), %rdi
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %rdi, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %r9, %r8
@@ -124,30 +128,36 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O0-NEXT:    movq %rsi, 16(%rax)
 ; CHECK-X64-O0-NEXT:    movq %rdx, 8(%rax)
 ; CHECK-X64-O0-NEXT:    movq %rcx, (%rax)
+; CHECK-X64-O0-NEXT:    movq %rbp, %rsp
+; CHECK-X64-O0-NEXT:    popq %rbp
 ; CHECK-X64-O0-NEXT:    retq
 ;
 ; CHECK-X64-O2-LABEL: shift1:
 ; CHECK-X64-O2:       # %bb.0: # %entry
-; CHECK-X64-O2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    pushq %rbp
+; CHECK-X64-O2-NEXT:    movq %rsp, %rbp
+; CHECK-X64-O2-NEXT:    andq $-32, %rsp
+; CHECK-X64-O2-NEXT:    subq $96, %rsp
+; CHECK-X64-O2-NEXT:    movq 32(%rbp), %rax
+; CHECK-X64-O2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rdi, (%rsp)
 ; CHECK-X64-O2-NEXT:    sarq $63, %rcx
-; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
 ; CHECK-X64-O2-NEXT:    shrb $6, %cl
 ; CHECK-X64-O2-NEXT:    movzbl %cl, %edx
-; CHECK-X64-O2-NEXT:    movq -56(%rsp,%rdx,8), %rsi
-; CHECK-X64-O2-NEXT:    movq -72(%rsp,%rdx,8), %rdi
-; CHECK-X64-O2-NEXT:    movq -64(%rsp,%rdx,8), %r9
+; CHECK-X64-O2-NEXT:    movq 16(%rsp,%rdx,8), %rsi
+; CHECK-X64-O2-NEXT:    movq (%rsp,%rdx,8), %rdi
+; CHECK-X64-O2-NEXT:    movq 8(%rsp,%rdx,8), %r9
 ; CHECK-X64-O2-NEXT:    movq %r9, %r10
 ; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
 ; CHECK-X64-O2-NEXT:    shrdq %cl, %rsi, %r10
-; CHECK-X64-O2-NEXT:    movq -48(%rsp,%rdx,8), %rdx
+; CHECK-X64-O2-NEXT:    movq 24(%rsp,%rdx,8), %rdx
 ; CHECK-X64-O2-NEXT:    shrdq %cl, %rdx, %rsi
 ; CHECK-X64-O2-NEXT:    shrdq %cl, %r9, %rdi
 ; CHECK-X64-O2-NEXT:    sarq %cl, %rdx
@@ -155,6 +165,8 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O2-NEXT:    movq %rsi, 16(%rax)
 ; CHECK-X64-O2-NEXT:    movq %r10, 8(%rax)
 ; CHECK-X64-O2-NEXT:    movq %rdi, (%rax)
+; CHECK-X64-O2-NEXT:    movq %rbp, %rsp
+; CHECK-X64-O2-NEXT:    popq %rbp
 ; CHECK-X64-O2-NEXT:    retq
 entry:
 	%0 = ashr i256 %x, %a
@@ -245,15 +257,19 @@ define i256 @shift2(i256 %c) nounwind
 ;
 ; CHECK-X64-O0-LABEL: shift2:
 ; CHECK-X64-O0:       # %bb.0:
+; CHECK-X64-O0-NEXT:    pushq %rbp
+; CHECK-X64-O0-NEXT:    movq %rsp, %rbp
+; CHECK-X64-O0-NEXT:    andq $-32, %rsp
+; CHECK-X64-O0-NEXT:    subq $128, %rsp
 ; CHECK-X64-O0-NEXT:    movq %rdi, %rax
-; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $1, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, {{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movb %sil, %cl
 ; CHECK-X64-O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-X64-O0-NEXT:    movb %cl, %dl
@@ -261,13 +277,13 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O0-NEXT:    andb $24, %dl
 ; CHECK-X64-O0-NEXT:    negb %dl
 ; CHECK-X64-O0-NEXT:    movsbq %dl, %r8
-; CHECK-X64-O0-NEXT:    movq -40(%rsp,%r8), %r9
-; CHECK-X64-O0-NEXT:    movq -32(%rsp,%r8), %rdx
-; CHECK-X64-O0-NEXT:    movq -24(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT:    movq 64(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT:    movq 72(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT:    movq 80(%rsp,%r8), %r10
 ; CHECK-X64-O0-NEXT:    movq %r10, %rsi
 ; CHECK-X64-O0-NEXT:    shldq %cl, %rdx, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    movq -16(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT:    movq 88(%rsp,%r8), %r8
 ; CHECK-X64-O0-NEXT:    shldq %cl, %r10, %r8
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-X64-O0-NEXT:    movq %r9, %r10
@@ -280,29 +296,35 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O0-NEXT:    movq %rsi, 16(%rdi)
 ; CHECK-X64-O0-NEXT:    movq %rdx, 8(%rdi)
 ; CHECK-X64-O0-NEXT:    movq %rcx, (%rdi)
+; CHECK-X64-O0-NEXT:    movq %rbp, %rsp
+; CHECK-X64-O0-NEXT:    popq %rbp
 ; CHECK-X64-O0-NEXT:    retq
 ;
 ; CHECK-X64-O2-LABEL: shift2:
 ; CHECK-X64-O2:       # %bb.0:
+; CHECK-X64-O2-NEXT:    pushq %rbp
+; CHECK-X64-O2-NEXT:    movq %rsp, %rbp
+; CHECK-X64-O2-NEXT:    andq $-32, %rsp
+; CHECK-X64-O2-NEXT:    subq $96, %rsp
 ; CHECK-X64-O2-NEXT:    movq %rsi, %rcx
 ; CHECK-X64-O2-NEXT:    movq %rdi, %rax
 ; CHECK-X64-O2-NEXT:    xorps %xmm0, %xmm0
-; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movaps %xmm0, (%rsp)
+; CHECK-X64-O2-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $1, {{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movl %ecx, %edx
 ; CHECK-X64-O2-NEXT:    shrb $3, %dl
 ; CHECK-X64-O2-NEXT:    andb $24, %dl
 ; CHECK-X64-O2-NEXT:    negb %dl
 ; CHECK-X64-O2-NEXT:    movsbq %dl, %rdx
-; CHECK-X64-O2-NEXT:    movq -40(%rsp,%rdx), %rsi
-; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT:    movq -24(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT:    movq 32(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT:    movq 40(%rsp,%rdx), %rdi
+; CHECK-X64-O2-NEXT:    movq 48(%rsp,%rdx), %r8
 ; CHECK-X64-O2-NEXT:    movq %r8, %r9
 ; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %r9
-; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rdx), %rdx
+; CHECK-X64-O2-NEXT:    movq 56(%rsp,%rdx), %rdx
 ; CHECK-X64-O2-NEXT:    shldq %cl, %r8, %rdx
 ; CHECK-X64-O2-NEXT:    movq %rsi, %r8
 ; CHECK-X64-O2-NEXT:    shlq %cl, %r8
@@ -312,6 +334,8 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O2-NEXT:    movq %r9, 16(%rax)
 ; CHECK-X64-O2-NEXT:    movq %rdi, 8(%rax)
 ; CHECK-X64-O2-NEXT:    movq %r8, (%rax)
+; CHECK-X64-O2-NEXT:    movq %rbp, %rsp
+; CHECK-X64-O2-NEXT:    popq %rbp
 ; CHECK-X64-O2-NEXT:    retq
 {
   %b = shl i256 1, %c  ; %c must not be a constant
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f60585e978104..9e3b9ca717df0 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -9,111 +9,118 @@
 define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ; SSE-LABEL: shl_i512:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $128, %rsp
+; SSE-NEXT:    movq 40(%rbp), %rax
+; SSE-NEXT:    movaps 16(%rbp), %xmm0
+; SSE-NEXT:    movq 32(%rbp), %r10
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
 ; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %eax
 ; SSE-NEXT:    andl $56, %eax
 ; SSE-NEXT:    negl %eax
 ; SSE-NEXT:    cltq
-; SSE-NEXT:    movq -56(%rsp,%rax), %rdx
-; SSE-NEXT:    movq -48(%rsp,%rax), %r9
+; SSE-NEXT:    movq 72(%rsp,%rax), %rdx
+; SSE-NEXT:    movq 80(%rsp,%rax), %r9
 ; SSE-NEXT:    movq %r9, %rsi
 ; SSE-NEXT:    shldq %cl, %rdx, %rsi
-; SSE-NEXT:    movq -40(%rsp,%rax), %r10
+; SSE-NEXT:    movq 88(%rsp,%rax), %r10
 ; SSE-NEXT:    movq %r10, %r8
 ; SSE-NEXT:    shldq %cl, %r9, %r8
-; SSE-NEXT:    movq -32(%rsp,%rax), %r9
-; SSE-NEXT:    movq %r9, %r11
-; SSE-NEXT:    shldq %cl, %r10, %r11
-; SSE-NEXT:    movq -24(%rsp,%rax), %r10
+; SSE-NEXT:    movq 96(%rsp,%rax), %r11
+; SSE-NEXT:    movq %r11, %r9
+; SSE-NEXT:    shldq %cl, %r10, %r9
+; SSE-NEXT:    movq 104(%rsp,%rax), %r10
 ; SSE-NEXT:    movq %r10, %rbx
-; SSE-NEXT:    shldq %cl, %r9, %rbx
-; SSE-NEXT:    movq -16(%rsp,%rax), %r9
-; SSE-NEXT:    movq %r9, %r14
+; SSE-NEXT:    shldq %cl, %r11, %rbx
+; SSE-NEXT:    movq 112(%rsp,%rax), %r11
+; SSE-NEXT:    movq %r11, %r14
 ; SSE-NEXT:    shldq %cl, %r10, %r14
-; SSE-NEXT:    movq -8(%rsp,%rax), %r10
-; SSE-NEXT:    shldq %cl, %r9, %r10
-; SSE-NEXT:    movq -64(%rsp,%rax), %rax
-; SSE-NEXT:    movq %rax, %r9
-; SSE-NEXT:    shlq %cl, %r9
+; SSE-NEXT:    movq 120(%rsp,%rax), %r10
+; SSE-NEXT:    shldq %cl, %r11, %r10
+; SSE-NEXT:    movq 64(%rsp,%rax), %rax
+; SSE-NEXT:    movq %rax, %r11
+; SSE-NEXT:    shlq %cl, %r11
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; SSE-NEXT:    shldq %cl, %rax, %rdx
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    movq %r10, 56(%rdi)
 ; SSE-NEXT:    movq %r14, 48(%rdi)
 ; SSE-NEXT:    movq %rbx, 40(%rdi)
-; SSE-NEXT:    movq %r11, 32(%rdi)
+; SSE-NEXT:    movq %r9, 32(%rdi)
 ; SSE-NEXT:    movq %r8, 24(%rdi)
 ; SSE-NEXT:    movq %rsi, 16(%rdi)
 ; SSE-NEXT:    movq %rdx, 8(%rdi)
-; SSE-NEXT:    movq %r9, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    movq %r11, (%rdi)
+; SSE-NEXT:    leaq -16(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: shl_i512:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $128, %rsp
+; AVX2-NEXT:    movq 40(%rbp), %rax
+; AVX2-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX2-NEXT:    movq 32(%rbp), %r10
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX2-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %eax
 ; AVX2-NEXT:    andl $56, %eax
 ; AVX2-NEXT:    negl %eax
 ; AVX2-NEXT:    movslq %eax, %r8
-; AVX2-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX2-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 72(%rsp,%r8), %rdx
+; AVX2-NEXT:    movq 80(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %rsi
 ; AVX2-NEXT:    shldq %cl, %rdx, %rsi
-; AVX2-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 88(%rsp,%r8), %r10
 ; AVX2-NEXT:    movq %r10, %r9
 ; AVX2-NEXT:    shldq %cl, %rax, %r9
-; AVX2-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 96(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %r11
 ; AVX2-NEXT:    shldq %cl, %r10, %r11
-; AVX2-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 104(%rsp,%r8), %r10
 ; AVX2-NEXT:    movq %r10, %rbx
 ; AVX2-NEXT:    shldq %cl, %rax, %rbx
-; AVX2-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 112(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %r14
 ; AVX2-NEXT:    shldq %cl, %r10, %r14
-; AVX2-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 120(%rsp,%r8), %r10
 ; AVX2-NEXT:    shldq %cl, %rax, %r10
 ; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT:    movq 64(%rsp,%r8), %rdi
 ; AVX2-NEXT:    shlxq %rcx, %rdi, %r8
 ; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX2-NEXT:    shldq %cl, %rdi, %rdx
@@ -125,55 +132,59 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX2-NEXT:    movq %rsi, 16(%rax)
 ; AVX2-NEXT:    movq %rdx, 8(%rax)
 ; AVX2-NEXT:    movq %r8, (%rax)
-; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    leaq -16(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shl_i512:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $128, %rsp
+; AVX512F-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512F-NEXT:    movq 32(%rbp), %rax
+; AVX512F-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq 40(%rbp), %rax
+; AVX512F-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp)
+; AVX512F-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %eax, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %eax
 ; AVX512F-NEXT:    andl $56, %eax
 ; AVX512F-NEXT:    negl %eax
 ; AVX512F-NEXT:    movslq %eax, %r8
-; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 72(%rsp,%r8), %rdx
+; AVX512F-NEXT:    movq 80(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %rsi
 ; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 88(%rsp,%r8), %r10
 ; AVX512F-NEXT:    movq %r10, %r9
 ; AVX512F-NEXT:    shldq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 96(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %r11
 ; AVX512F-NEXT:    shldq %cl, %r10, %r11
-; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 104(%rsp,%r8), %r10
 ; AVX512F-NEXT:    movq %r10, %rbx
 ; AVX512F-NEXT:    shldq %cl, %rax, %rbx
-; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 112(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %r14
 ; AVX512F-NEXT:    shldq %cl, %r10, %r14
-; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 120(%rsp,%r8), %r10
 ; AVX512F-NEXT:    shldq %cl, %rax, %r10
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT:    movq 64(%rsp,%r8), %rdi
 ; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
 ; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
@@ -185,54 +196,59 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %rsi, 16(%rax)
 ; AVX512F-NEXT:    movq %rdx, 8(%rax)
 ; AVX512F-NEXT:    movq %r8, (%rax)
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shl_i512:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r15
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    movq 40(%rbp), %rax
+; AVX512VL-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512VL-NEXT:    movq 32(%rbp), %r10
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %eax, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %eax
 ; AVX512VL-NEXT:    andl $56, %eax
 ; AVX512VL-NEXT:    negl %eax
 ; AVX512VL-NEXT:    movslq %eax, %r9
-; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT:    movq 72(%rsp,%r9), %rdx
+; AVX512VL-NEXT:    movq 80(%rsp,%r9), %rax
 ; AVX512VL-NEXT:    movq %rax, %rsi
 ; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT:    movq 88(%rsp,%r9), %r10
 ; AVX512VL-NEXT:    movq %r10, %r8
 ; AVX512VL-NEXT:    shldq %cl, %rax, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq 96(%rsp,%r9), %r11
 ; AVX512VL-NEXT:    movq %r11, %rbx
 ; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq 104(%rsp,%r9), %rdi
 ; AVX512VL-NEXT:    movq %rdi, %r10
 ; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT:    movq 64(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq 112(%rsp,%r9), %r14
 ; AVX512VL-NEXT:    movq %r14, %r15
 ; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq 120(%rsp,%r9), %rdi
 ; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
 ; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
 ; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -245,55 +261,61 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rsi, 16(%rax)
 ; AVX512VL-NEXT:    movq %rdx, 8(%rax)
 ; AVX512VL-NEXT:    movq %r9, (%rax)
+; AVX512VL-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shl_i512:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r15
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    movq 40(%rbp), %rax
+; AVX512VBMI-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512VBMI-NEXT:    movq 32(%rbp), %r10
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movl %eax, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %eax
 ; AVX512VBMI-NEXT:    andl $56, %eax
 ; AVX512VBMI-NEXT:    negl %eax
 ; AVX512VBMI-NEXT:    movslq %eax, %r9
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT:    movq 72(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT:    movq 80(%rsp,%r9), %rax
 ; AVX512VBMI-NEXT:    movq %rax, %rsi
 ; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT:    movq 88(%rsp,%r9), %r10
 ; AVX512VBMI-NEXT:    movq %r10, %r8
 ; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq 96(%rsp,%r9), %r11
 ; AVX512VBMI-NEXT:    movq %r11, %rbx
 ; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq 104(%rsp,%r9), %rdi
 ; AVX512VBMI-NEXT:    movq %rdi, %r10
 ; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT:    movq 64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq 112(%rsp,%r9), %r14
 ; AVX512VBMI-NEXT:    movq %r14, %r15
 ; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq 120(%rsp,%r9), %rdi
 ; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
 ; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
 ; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -306,9 +328,11 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
 ; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
 ; AVX512VBMI-NEXT:    movq %r9, (%rax)
+; AVX512VBMI-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
 ; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %r = shl i512 %a0, %a1
@@ -318,43 +342,47 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; SSE-LABEL: lshr_i512:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r15
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $160, %rsp
+; SSE-NEXT:    movq 40(%rbp), %rax
+; SSE-NEXT:    movaps 16(%rbp), %xmm0
+; SSE-NEXT:    movq 32(%rbp), %r10
 ; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rsi, (%rsp)
 ; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %eax
 ; SSE-NEXT:    andl $56, %eax
-; SSE-NEXT:    movq -112(%rsp,%rax), %rdx
-; SSE-NEXT:    movq -120(%rsp,%rax), %r9
+; SSE-NEXT:    movq 16(%rsp,%rax), %rdx
+; SSE-NEXT:    movq 8(%rsp,%rax), %r9
 ; SSE-NEXT:    movq %r9, %rsi
 ; SSE-NEXT:    shrdq %cl, %rdx, %rsi
-; SSE-NEXT:    movq -104(%rsp,%rax), %r8
+; SSE-NEXT:    movq 24(%rsp,%rax), %r8
 ; SSE-NEXT:    shrdq %cl, %r8, %rdx
-; SSE-NEXT:    movq -96(%rsp,%rax), %r10
+; SSE-NEXT:    movq 32(%rsp,%rax), %r10
 ; SSE-NEXT:    shrdq %cl, %r10, %r8
-; SSE-NEXT:    movq -88(%rsp,%rax), %r11
+; SSE-NEXT:    movq 40(%rsp,%rax), %r11
 ; SSE-NEXT:    shrdq %cl, %r11, %r10
-; SSE-NEXT:    movq -80(%rsp,%rax), %rbx
+; SSE-NEXT:    movq 48(%rsp,%rax), %rbx
 ; SSE-NEXT:    shrdq %cl, %rbx, %r11
-; SSE-NEXT:    movq -72(%rsp,%rax), %r14
+; SSE-NEXT:    movq 56(%rsp,%rax), %r14
 ; SSE-NEXT:    shrdq %cl, %r14, %rbx
-; SSE-NEXT:    movq -128(%rsp,%rax), %r15
+; SSE-NEXT:    movq (%rsp,%rax), %r15
 ; SSE-NEXT:    shrdq %cl, %r9, %r15
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -367,47 +395,53 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; SSE-NEXT:    movq %rdx, 16(%rdi)
 ; SSE-NEXT:    movq %rsi, 8(%rdi)
 ; SSE-NEXT:    movq %r15, (%rdi)
+; SSE-NEXT:    leaq -24(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
 ; SSE-NEXT:    popq %r15
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: lshr_i512:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r15
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $160, %rsp
+; AVX2-NEXT:    movq 40(%rbp), %rax
+; AVX2-NEXT:    vmovaps 16(%rbp), %xmm0
 ; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq 32(%rbp), %r10
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rsi, (%rsp)
 ; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %eax
 ; AVX2-NEXT:    andl $56, %eax
-; AVX2-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX2-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX2-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX2-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX2-NEXT:    movq %r9, %rsi
 ; AVX2-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX2-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX2-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX2-NEXT:    shrdq %cl, %r8, %rdx
-; AVX2-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX2-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX2-NEXT:    shrdq %cl, %r10, %r8
-; AVX2-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX2-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX2-NEXT:    shrdq %cl, %r11, %r10
-; AVX2-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX2-NEXT:    shrdq %cl, %rbx, %r11
-; AVX2-NEXT:    movq -128(%rsp,%rax), %r14
-; AVX2-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX2-NEXT:    movq (%rsp,%rax), %r14
+; AVX2-NEXT:    movq 56(%rsp,%rax), %r15
 ; AVX2-NEXT:    shrdq %cl, %r15, %rbx
 ; AVX2-NEXT:    shrdq %cl, %r9, %r14
 ; AVX2-NEXT:    movq %rdi, %rax
@@ -420,47 +454,53 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX2-NEXT:    movq %rdx, 16(%rdi)
 ; AVX2-NEXT:    movq %rsi, 8(%rdi)
 ; AVX2-NEXT:    movq %r14, (%rdi)
+; AVX2-NEXT:    leaq -24(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
 ; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: lshr_i512:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r15
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $160, %rsp
+; AVX512F-NEXT:    movq 40(%rbp), %rax
+; AVX512F-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512F-NEXT:    movq 32(%rbp), %r10
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rsi, (%rsp)
 ; AVX512F-NEXT:    movl %eax, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %eax
 ; AVX512F-NEXT:    andl $56, %eax
-; AVX512F-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX512F-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX512F-NEXT:    movq %r9, %rsi
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX512F-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX512F-NEXT:    shrdq %cl, %r10, %r8
-; AVX512F-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT:    movq (%rsp,%rax), %r14
+; AVX512F-NEXT:    movq 56(%rsp,%rax), %r15
 ; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
 ; AVX512F-NEXT:    shrdq %cl, %r9, %r14
 ; AVX512F-NEXT:    movq %rdi, %rax
@@ -473,48 +513,54 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512F-NEXT:    movq %rsi, 8(%rdi)
 ; AVX512F-NEXT:    movq %r14, (%rdi)
+; AVX512F-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
 ; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_i512:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r15
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512VL-NEXT:    movq 40(%rbp), %rax
 ; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq 32(%rbp), %r10
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rsi, (%rsp)
 ; AVX512VL-NEXT:    movl %eax, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %eax
 ; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX512VL-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX512VL-NEXT:    movq %r9, %rsi
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX512VL-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX512VL-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VL-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT:    movq 56(%rsp,%rax), %r14
 ; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT:    movq (%rsp,%rax), %r15
 ; AVX512VL-NEXT:    shrdq %cl, %r9, %r15
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
@@ -526,49 +572,55 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
 ; AVX512VL-NEXT:    movq %r15, (%rdi)
+; AVX512VL-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_i512:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r15
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512VBMI-NEXT:    movq 40(%rbp), %rax
 ; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq 32(%rbp), %r10
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, (%rsp)
 ; AVX512VBMI-NEXT:    movl %eax, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %eax
 ; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX512VBMI-NEXT:    movq %r9, %rsi
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT:    movq 56(%rsp,%rax), %r14
 ; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT:    movq (%rsp,%rax), %r15
 ; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r15
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
@@ -580,9 +632,11 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
 ; AVX512VBMI-NEXT:    movq %r15, (%rdi)
+; AVX512VBMI-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
 ; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %r = lshr i512 %a0, %a1
@@ -592,47 +646,51 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; SSE-LABEL: ashr_i512:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r15
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $160, %rsp
+; SSE-NEXT:    movq 40(%rbp), %rax
+; SSE-NEXT:    movaps 16(%rbp), %xmm0
+; SSE-NEXT:    movq 32(%rbp), %r10
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rsi, (%rsp)
 ; SSE-NEXT:    sarq $63, %r10
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %eax
 ; SSE-NEXT:    andl $56, %eax
-; SSE-NEXT:    movq -112(%rsp,%rax), %rdx
-; SSE-NEXT:    movq -120(%rsp,%rax), %r9
+; SSE-NEXT:    movq 16(%rsp,%rax), %rdx
+; SSE-NEXT:    movq 8(%rsp,%rax), %r9
 ; SSE-NEXT:    movq %r9, %rsi
 ; SSE-NEXT:    shrdq %cl, %rdx, %rsi
-; SSE-NEXT:    movq -104(%rsp,%rax), %r8
+; SSE-NEXT:    movq 24(%rsp,%rax), %r8
 ; SSE-NEXT:    shrdq %cl, %r8, %rdx
-; SSE-NEXT:    movq -96(%rsp,%rax), %r10
+; SSE-NEXT:    movq 32(%rsp,%rax), %r10
 ; SSE-NEXT:    shrdq %cl, %r10, %r8
-; SSE-NEXT:    movq -88(%rsp,%rax), %r11
+; SSE-NEXT:    movq 40(%rsp,%rax), %r11
 ; SSE-NEXT:    shrdq %cl, %r11, %r10
-; SSE-NEXT:    movq -80(%rsp,%rax), %rbx
+; SSE-NEXT:    movq 48(%rsp,%rax), %rbx
 ; SSE-NEXT:    shrdq %cl, %rbx, %r11
-; SSE-NEXT:    movq -72(%rsp,%rax), %r14
+; SSE-NEXT:    movq 56(%rsp,%rax), %r14
 ; SSE-NEXT:    shrdq %cl, %r14, %rbx
-; SSE-NEXT:    movq -128(%rsp,%rax), %r15
+; SSE-NEXT:    movq (%rsp,%rax), %r15
 ; SSE-NEXT:    shrdq %cl, %r9, %r15
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -645,53 +703,59 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; SSE-NEXT:    movq %rdx, 16(%rdi)
 ; SSE-NEXT:    movq %rsi, 8(%rdi)
 ; SSE-NEXT:    movq %r15, (%rdi)
+; SSE-NEXT:    leaq -24(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
 ; SSE-NEXT:    popq %r15
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: ashr_i512:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r15
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $160, %rsp
+; AVX2-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX2-NEXT:    movq 32(%rbp), %r10
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq 40(%rbp), %rax
+; AVX2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rsi, (%rsp)
 ; AVX2-NEXT:    sarq $63, %r10
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %eax
 ; AVX2-NEXT:    andl $56, %eax
-; AVX2-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX2-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX2-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX2-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX2-NEXT:    movq %r9, %rsi
 ; AVX2-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX2-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX2-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX2-NEXT:    shrdq %cl, %r8, %rdx
-; AVX2-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX2-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX2-NEXT:    shrdq %cl, %r10, %r8
-; AVX2-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX2-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX2-NEXT:    shrdq %cl, %r11, %r10
-; AVX2-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX2-NEXT:    shrdq %cl, %rbx, %r11
-; AVX2-NEXT:    movq -128(%rsp,%rax), %r14
-; AVX2-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX2-NEXT:    movq (%rsp,%rax), %r14
+; AVX2-NEXT:    movq 56(%rsp,%rax), %r15
 ; AVX2-NEXT:    shrdq %cl, %r15, %rbx
 ; AVX2-NEXT:    shrdq %cl, %r9, %r14
 ; AVX2-NEXT:    movq %rdi, %rax
@@ -704,53 +768,59 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX2-NEXT:    movq %rdx, 16(%rdi)
 ; AVX2-NEXT:    movq %rsi, 8(%rdi)
 ; AVX2-NEXT:    movq %r14, (%rdi)
+; AVX2-NEXT:    leaq -24(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
 ; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ashr_i512:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r15
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $160, %rsp
+; AVX512F-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512F-NEXT:    movq 32(%rbp), %r10
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq 40(%rbp), %rax
+; AVX512F-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rsi, (%rsp)
 ; AVX512F-NEXT:    sarq $63, %r10
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %eax, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %eax
 ; AVX512F-NEXT:    andl $56, %eax
-; AVX512F-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX512F-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX512F-NEXT:    movq %r9, %rsi
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX512F-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX512F-NEXT:    shrdq %cl, %r10, %r8
-; AVX512F-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT:    movq (%rsp,%rax), %r14
+; AVX512F-NEXT:    movq 56(%rsp,%rax), %r15
 ; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
 ; AVX512F-NEXT:    shrdq %cl, %r9, %r14
 ; AVX512F-NEXT:    movq %rdi, %rax
@@ -763,54 +833,60 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512F-NEXT:    movq %rsi, 8(%rdi)
 ; AVX512F-NEXT:    movq %r14, (%rdi)
+; AVX512F-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
 ; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_i512:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r15
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512VL-NEXT:    movq 32(%rbp), %r10
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq 40(%rbp), %rax
+; AVX512VL-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rsi, (%rsp)
 ; AVX512VL-NEXT:    sarq $63, %r10
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %eax, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %eax
 ; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX512VL-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX512VL-NEXT:    movq %r9, %rsi
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX512VL-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX512VL-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VL-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT:    movq 56(%rsp,%rax), %r14
 ; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT:    movq (%rsp,%rax), %r15
 ; AVX512VL-NEXT:    shrdq %cl, %r9, %r15
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
@@ -822,54 +898,60 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
 ; AVX512VL-NEXT:    movq %r15, (%rdi)
+; AVX512VL-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_i512:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r15
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    vmovaps 16(%rbp), %xmm0
+; AVX512VBMI-NEXT:    movq 32(%rbp), %r10
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq 40(%rbp), %rax
+; AVX512VBMI-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, (%rsp)
 ; AVX512VBMI-NEXT:    sarq $63, %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movl %eax, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %eax
 ; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT:    movq 16(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rax), %r9
 ; AVX512VBMI-NEXT:    movq %r9, %rsi
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT:    movq 24(%rsp,%rax), %r8
 ; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT:    movq 32(%rsp,%rax), %r10
 ; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT:    movq 40(%rsp,%rax), %r11
 ; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT:    movq 48(%rsp,%rax), %rbx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT:    movq 56(%rsp,%rax), %r14
 ; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT:    movq (%rsp,%rax), %r15
 ; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r15
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
@@ -881,9 +963,11 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
 ; AVX512VBMI-NEXT:    movq %r15, (%rdi)
+; AVX512VBMI-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
 ; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    retq
   %r = ashr i512 %a0, %a1
   ret i512 %r
@@ -892,103 +976,110 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
 ; SSE-LABEL: shl_i512_load:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $128, %rsp
 ; SSE-NEXT:    movaps (%rsi), %xmm0
 ; SSE-NEXT:    movaps 16(%rsi), %xmm1
 ; SSE-NEXT:    movaps 32(%rsi), %xmm2
 ; SSE-NEXT:    movaps 48(%rsi), %xmm3
 ; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, (%rsp)
+; SSE-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movl %edx, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %edx
 ; SSE-NEXT:    andl $56, %edx
 ; SSE-NEXT:    negl %edx
 ; SSE-NEXT:    movslq %edx, %rax
-; SSE-NEXT:    movq -56(%rsp,%rax), %rdx
-; SSE-NEXT:    movq -48(%rsp,%rax), %r9
+; SSE-NEXT:    movq 72(%rsp,%rax), %rdx
+; SSE-NEXT:    movq 80(%rsp,%rax), %r9
 ; SSE-NEXT:    movq %r9, %rsi
 ; SSE-NEXT:    shldq %cl, %rdx, %rsi
-; SSE-NEXT:    movq -40(%rsp,%rax), %r10
+; SSE-NEXT:    movq 88(%rsp,%rax), %r10
 ; SSE-NEXT:    movq %r10, %r8
 ; SSE-NEXT:    shldq %cl, %r9, %r8
-; SSE-NEXT:    movq -32(%rsp,%rax), %r9
-; SSE-NEXT:    movq %r9, %r11
-; SSE-NEXT:    shldq %cl, %r10, %r11
-; SSE-NEXT:    movq -24(%rsp,%rax), %r10
+; SSE-NEXT:    movq 96(%rsp,%rax), %r11
+; SSE-NEXT:    movq %r11, %r9
+; SSE-NEXT:    shldq %cl, %r10, %r9
+; SSE-NEXT:    movq 104(%rsp,%rax), %r10
 ; SSE-NEXT:    movq %r10, %rbx
-; SSE-NEXT:    shldq %cl, %r9, %rbx
-; SSE-NEXT:    movq -16(%rsp,%rax), %r9
-; SSE-NEXT:    movq %r9, %r14
+; SSE-NEXT:    shldq %cl, %r11, %rbx
+; SSE-NEXT:    movq 112(%rsp,%rax), %r11
+; SSE-NEXT:    movq %r11, %r14
 ; SSE-NEXT:    shldq %cl, %r10, %r14
-; SSE-NEXT:    movq -8(%rsp,%rax), %r10
-; SSE-NEXT:    shldq %cl, %r9, %r10
-; SSE-NEXT:    movq -64(%rsp,%rax), %rax
-; SSE-NEXT:    movq %rax, %r9
-; SSE-NEXT:    shlq %cl, %r9
+; SSE-NEXT:    movq 120(%rsp,%rax), %r10
+; SSE-NEXT:    shldq %cl, %r11, %r10
+; SSE-NEXT:    movq 64(%rsp,%rax), %rax
+; SSE-NEXT:    movq %rax, %r11
+; SSE-NEXT:    shlq %cl, %r11
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; SSE-NEXT:    shldq %cl, %rax, %rdx
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    movq %r10, 56(%rdi)
 ; SSE-NEXT:    movq %r14, 48(%rdi)
 ; SSE-NEXT:    movq %rbx, 40(%rdi)
-; SSE-NEXT:    movq %r11, 32(%rdi)
+; SSE-NEXT:    movq %r9, 32(%rdi)
 ; SSE-NEXT:    movq %r8, 24(%rdi)
 ; SSE-NEXT:    movq %rsi, 16(%rdi)
 ; SSE-NEXT:    movq %rdx, 8(%rdi)
-; SSE-NEXT:    movq %r9, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    movq %r11, (%rdi)
+; SSE-NEXT:    leaq -16(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: shl_i512_load:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovups (%rsi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rsi), %ymm1
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $128, %rsp
+; AVX2-NEXT:    vmovaps (%rsi), %ymm0
+; AVX2-NEXT:    vmovaps 32(%rsi), %ymm1
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    movl %edx, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %edx
 ; AVX2-NEXT:    andl $56, %edx
 ; AVX2-NEXT:    negl %edx
 ; AVX2-NEXT:    movslq %edx, %r8
-; AVX2-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX2-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 72(%rsp,%r8), %rdx
+; AVX2-NEXT:    movq 80(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %rsi
 ; AVX2-NEXT:    shldq %cl, %rdx, %rsi
-; AVX2-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 88(%rsp,%r8), %r10
 ; AVX2-NEXT:    movq %r10, %r9
 ; AVX2-NEXT:    shldq %cl, %rax, %r9
-; AVX2-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 96(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %r11
 ; AVX2-NEXT:    shldq %cl, %r10, %r11
-; AVX2-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 104(%rsp,%r8), %r10
 ; AVX2-NEXT:    movq %r10, %rbx
 ; AVX2-NEXT:    shldq %cl, %rax, %rbx
-; AVX2-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 112(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %r14
 ; AVX2-NEXT:    shldq %cl, %r10, %r14
-; AVX2-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 120(%rsp,%r8), %r10
 ; AVX2-NEXT:    shldq %cl, %rax, %r10
 ; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT:    movq 64(%rsp,%r8), %rdi
 ; AVX2-NEXT:    shlxq %rcx, %rdi, %r8
 ; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX2-NEXT:    shldq %cl, %rdi, %rdx
@@ -1000,47 +1091,51 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX2-NEXT:    movq %rsi, 16(%rax)
 ; AVX2-NEXT:    movq %rdx, 8(%rax)
 ; AVX2-NEXT:    movq %r8, (%rax)
-; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    leaq -16(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shl_i512_load:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $128, %rsp
 ; AVX512F-NEXT:    vmovups (%rsi), %zmm0
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm1, (%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %edx, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %edx
 ; AVX512F-NEXT:    andl $56, %edx
 ; AVX512F-NEXT:    negl %edx
 ; AVX512F-NEXT:    movslq %edx, %r8
-; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 72(%rsp,%r8), %rdx
+; AVX512F-NEXT:    movq 80(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %rsi
 ; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 88(%rsp,%r8), %r10
 ; AVX512F-NEXT:    movq %r10, %r9
 ; AVX512F-NEXT:    shldq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 96(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %r11
 ; AVX512F-NEXT:    shldq %cl, %r10, %r11
-; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 104(%rsp,%r8), %r10
 ; AVX512F-NEXT:    movq %r10, %rbx
 ; AVX512F-NEXT:    shldq %cl, %rax, %rbx
-; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 112(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %r14
 ; AVX512F-NEXT:    shldq %cl, %r10, %r14
-; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 120(%rsp,%r8), %r10
 ; AVX512F-NEXT:    shldq %cl, %rax, %r10
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT:    movq 64(%rsp,%r8), %rdi
 ; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
 ; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
@@ -1052,48 +1147,53 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %rsi, 16(%rax)
 ; AVX512F-NEXT:    movq %rdx, 8(%rax)
 ; AVX512F-NEXT:    movq %r8, (%rax)
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shl_i512_load:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r15
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rsi), %ymm1
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    vmovaps (%rsi), %ymm0
+; AVX512VL-NEXT:    vmovaps 32(%rsi), %ymm1
 ; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, (%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %edx, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %edx
 ; AVX512VL-NEXT:    andl $56, %edx
 ; AVX512VL-NEXT:    negl %edx
 ; AVX512VL-NEXT:    movslq %edx, %r9
-; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT:    movq 72(%rsp,%r9), %rdx
+; AVX512VL-NEXT:    movq 80(%rsp,%r9), %rax
 ; AVX512VL-NEXT:    movq %rax, %rsi
 ; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT:    movq 88(%rsp,%r9), %r10
 ; AVX512VL-NEXT:    movq %r10, %r8
 ; AVX512VL-NEXT:    shldq %cl, %rax, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq 96(%rsp,%r9), %r11
 ; AVX512VL-NEXT:    movq %r11, %rbx
 ; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq 104(%rsp,%r9), %rdi
 ; AVX512VL-NEXT:    movq %rdi, %r10
 ; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT:    movq 64(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq 112(%rsp,%r9), %r14
 ; AVX512VL-NEXT:    movq %r14, %r15
 ; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq 120(%rsp,%r9), %rdi
 ; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
 ; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
 ; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -1106,49 +1206,55 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rsi, 16(%rax)
 ; AVX512VL-NEXT:    movq %rdx, 8(%rax)
 ; AVX512VL-NEXT:    movq %r9, (%rax)
+; AVX512VL-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shl_i512_load:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r15
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rsi), %ymm1
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    vmovaps (%rsi), %ymm0
+; AVX512VBMI-NEXT:    vmovaps 32(%rsi), %ymm1
 ; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, (%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movl %edx, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %edx
 ; AVX512VBMI-NEXT:    andl $56, %edx
 ; AVX512VBMI-NEXT:    negl %edx
 ; AVX512VBMI-NEXT:    movslq %edx, %r9
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT:    movq 72(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT:    movq 80(%rsp,%r9), %rax
 ; AVX512VBMI-NEXT:    movq %rax, %rsi
 ; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT:    movq 88(%rsp,%r9), %r10
 ; AVX512VBMI-NEXT:    movq %r10, %r8
 ; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq 96(%rsp,%r9), %r11
 ; AVX512VBMI-NEXT:    movq %r11, %rbx
 ; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq 104(%rsp,%r9), %rdi
 ; AVX512VBMI-NEXT:    movq %rdi, %r10
 ; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT:    movq 64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq 112(%rsp,%r9), %r14
 ; AVX512VBMI-NEXT:    movq %r14, %r15
 ; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq 120(%rsp,%r9), %rdi
 ; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
 ; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
 ; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -1161,9 +1267,11 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
 ; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
 ; AVX512VBMI-NEXT:    movq %r9, (%rax)
+; AVX512VBMI-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
 ; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -1174,41 +1282,44 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
 define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; SSE-LABEL: lshr_i512_load:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $128, %rsp
 ; SSE-NEXT:    movaps (%rsi), %xmm0
 ; SSE-NEXT:    movaps 16(%rsi), %xmm1
 ; SSE-NEXT:    movaps 32(%rsi), %xmm2
 ; SSE-NEXT:    movaps 48(%rsi), %xmm3
 ; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
 ; SSE-NEXT:    movl %edx, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %edx
 ; SSE-NEXT:    andl $56, %edx
-; SSE-NEXT:    movq -112(%rsp,%rdx), %rsi
-; SSE-NEXT:    movq -120(%rsp,%rdx), %rax
+; SSE-NEXT:    movq 16(%rsp,%rdx), %rsi
+; SSE-NEXT:    movq 8(%rsp,%rdx), %rax
 ; SSE-NEXT:    movq %rax, %r8
 ; SSE-NEXT:    shrdq %cl, %rsi, %r8
-; SSE-NEXT:    movq -104(%rsp,%rdx), %r9
+; SSE-NEXT:    movq 24(%rsp,%rdx), %r9
 ; SSE-NEXT:    shrdq %cl, %r9, %rsi
-; SSE-NEXT:    movq -96(%rsp,%rdx), %r10
+; SSE-NEXT:    movq 32(%rsp,%rdx), %r10
 ; SSE-NEXT:    shrdq %cl, %r10, %r9
-; SSE-NEXT:    movq -88(%rsp,%rdx), %r11
+; SSE-NEXT:    movq 40(%rsp,%rdx), %r11
 ; SSE-NEXT:    shrdq %cl, %r11, %r10
-; SSE-NEXT:    movq -80(%rsp,%rdx), %rbx
+; SSE-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; SSE-NEXT:    shrdq %cl, %rbx, %r11
-; SSE-NEXT:    movq -72(%rsp,%rdx), %r14
+; SSE-NEXT:    movq 56(%rsp,%rdx), %r14
 ; SSE-NEXT:    shrdq %cl, %r14, %rbx
-; SSE-NEXT:    movq -128(%rsp,%rdx), %rdx
+; SSE-NEXT:    movq (%rsp,%rdx), %rdx
 ; SSE-NEXT:    shrdq %cl, %rax, %rdx
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -1221,41 +1332,45 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; SSE-NEXT:    movq %rsi, 16(%rdi)
 ; SSE-NEXT:    movq %r8, 8(%rdi)
 ; SSE-NEXT:    movq %rdx, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    leaq -16(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: lshr_i512_load:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovups (%rsi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rsi), %ymm1
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $128, %rsp
+; AVX2-NEXT:    vmovaps (%rsi), %ymm0
+; AVX2-NEXT:    vmovaps 32(%rsi), %ymm1
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    movl %edx, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %edx
 ; AVX2-NEXT:    andl $56, %edx
-; AVX2-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX2-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX2-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX2-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX2-NEXT:    movq %rax, %r8
 ; AVX2-NEXT:    shrdq %cl, %rsi, %r8
-; AVX2-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX2-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX2-NEXT:    shrdq %cl, %r9, %rsi
-; AVX2-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX2-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX2-NEXT:    shrdq %cl, %r10, %r9
-; AVX2-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX2-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX2-NEXT:    shrdq %cl, %r11, %r10
-; AVX2-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX2-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX2-NEXT:    shrdq %cl, %rbx, %r11
-; AVX2-NEXT:    movq -128(%rsp,%rdx), %r14
-; AVX2-NEXT:    movq -72(%rsp,%rdx), %rdx
+; AVX2-NEXT:    movq (%rsp,%rdx), %r14
+; AVX2-NEXT:    movq 56(%rsp,%rdx), %rdx
 ; AVX2-NEXT:    shrdq %cl, %rdx, %rbx
 ; AVX2-NEXT:    shrdq %cl, %rax, %r14
 ; AVX2-NEXT:    movq %rdi, %rax
@@ -1268,39 +1383,43 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX2-NEXT:    movq %rsi, 16(%rdi)
 ; AVX2-NEXT:    movq %r8, 8(%rdi)
 ; AVX2-NEXT:    movq %r14, (%rdi)
-; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    leaq -16(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: lshr_i512_load:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $128, %rsp
 ; AVX512F-NEXT:    vmovups (%rsi), %zmm0
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp)
 ; AVX512F-NEXT:    movl %edx, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %edx
 ; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX512F-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX512F-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX512F-NEXT:    movq %rax, %r8
 ; AVX512F-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX512F-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX512F-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512F-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX512F-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX512F-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX512F-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rdx), %rdx
+; AVX512F-NEXT:    movq (%rsp,%rdx), %r14
+; AVX512F-NEXT:    movq 56(%rsp,%rdx), %rdx
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rax, %r14
 ; AVX512F-NEXT:    movq %rdi, %rax
@@ -1313,42 +1432,46 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %rsi, 16(%rdi)
 ; AVX512F-NEXT:    movq %r8, 8(%rdi)
 ; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_i512_load:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rsi), %ymm1
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $128, %rsp
+; AVX512VL-NEXT:    vmovaps (%rsi), %ymm0
+; AVX512VL-NEXT:    vmovaps 32(%rsi), %ymm1
 ; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VL-NEXT:    movl %edx, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %edx
 ; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX512VL-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX512VL-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX512VL-NEXT:    movq %rax, %r8
 ; AVX512VL-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX512VL-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX512VL-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX512VL-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX512VL-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX512VL-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rdx), %r14
+; AVX512VL-NEXT:    movq 56(%rsp,%rdx), %r14
 ; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rdx
+; AVX512VL-NEXT:    movq (%rsp,%rdx), %rdx
 ; AVX512VL-NEXT:    shrdq %cl, %rax, %rdx
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
@@ -1360,43 +1483,47 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rsi, 16(%rdi)
 ; AVX512VL-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VL-NEXT:    movq %rdx, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
+; AVX512VL-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_i512_load:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rsi), %ymm1
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $128, %rsp
+; AVX512VBMI-NEXT:    vmovaps (%rsi), %ymm0
+; AVX512VBMI-NEXT:    vmovaps 32(%rsi), %ymm1
 ; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VBMI-NEXT:    movl %edx, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %edx
 ; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX512VBMI-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX512VBMI-NEXT:    movq %rax, %r8
 ; AVX512VBMI-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX512VBMI-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX512VBMI-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX512VBMI-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX512VBMI-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rdx), %r14
+; AVX512VBMI-NEXT:    movq 56(%rsp,%rdx), %r14
 ; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT:    movq (%rsp,%rdx), %rdx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rdx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
@@ -1408,9 +1535,10 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512VBMI-NEXT:    movq %rsi, 16(%rdi)
 ; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VBMI-NEXT:    movq %rdx, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
+; AVX512VBMI-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -1421,47 +1549,50 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; SSE-LABEL: ashr_i512_load:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $128, %rsp
 ; SSE-NEXT:    movaps (%rsi), %xmm0
 ; SSE-NEXT:    movaps 16(%rsi), %xmm1
 ; SSE-NEXT:    movaps 32(%rsi), %xmm2
 ; SSE-NEXT:    movq 48(%rsi), %rax
 ; SSE-NEXT:    movq 56(%rsi), %rcx
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
 ; SSE-NEXT:    sarq $63, %rcx
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movl %edx, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %edx
 ; SSE-NEXT:    andl $56, %edx
-; SSE-NEXT:    movq -112(%rsp,%rdx), %rsi
-; SSE-NEXT:    movq -120(%rsp,%rdx), %rax
+; SSE-NEXT:    movq 16(%rsp,%rdx), %rsi
+; SSE-NEXT:    movq 8(%rsp,%rdx), %rax
 ; SSE-NEXT:    movq %rax, %r8
 ; SSE-NEXT:    shrdq %cl, %rsi, %r8
-; SSE-NEXT:    movq -104(%rsp,%rdx), %r9
+; SSE-NEXT:    movq 24(%rsp,%rdx), %r9
 ; SSE-NEXT:    shrdq %cl, %r9, %rsi
-; SSE-NEXT:    movq -96(%rsp,%rdx), %r10
+; SSE-NEXT:    movq 32(%rsp,%rdx), %r10
 ; SSE-NEXT:    shrdq %cl, %r10, %r9
-; SSE-NEXT:    movq -88(%rsp,%rdx), %r11
+; SSE-NEXT:    movq 40(%rsp,%rdx), %r11
 ; SSE-NEXT:    shrdq %cl, %r11, %r10
-; SSE-NEXT:    movq -80(%rsp,%rdx), %rbx
+; SSE-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; SSE-NEXT:    shrdq %cl, %rbx, %r11
-; SSE-NEXT:    movq -72(%rsp,%rdx), %r14
+; SSE-NEXT:    movq 56(%rsp,%rdx), %r14
 ; SSE-NEXT:    shrdq %cl, %r14, %rbx
-; SSE-NEXT:    movq -128(%rsp,%rdx), %rdx
+; SSE-NEXT:    movq (%rsp,%rdx), %rdx
 ; SSE-NEXT:    shrdq %cl, %rax, %rdx
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -1474,51 +1605,55 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; SSE-NEXT:    movq %rsi, 16(%rdi)
 ; SSE-NEXT:    movq %r8, 8(%rdi)
 ; SSE-NEXT:    movq %rdx, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    leaq -16(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: ashr_i512_load:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovups (%rsi), %ymm0
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $128, %rsp
+; AVX2-NEXT:    vmovaps (%rsi), %ymm0
 ; AVX2-NEXT:    vmovaps 32(%rsi), %xmm1
 ; AVX2-NEXT:    movq 48(%rsi), %rax
 ; AVX2-NEXT:    movq 56(%rsi), %rcx
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    sarq $63, %rcx
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    movl %edx, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %edx
 ; AVX2-NEXT:    andl $56, %edx
-; AVX2-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX2-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX2-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX2-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX2-NEXT:    movq %rax, %r8
 ; AVX2-NEXT:    shrdq %cl, %rsi, %r8
-; AVX2-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX2-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX2-NEXT:    shrdq %cl, %r9, %rsi
-; AVX2-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX2-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX2-NEXT:    shrdq %cl, %r10, %r9
-; AVX2-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX2-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX2-NEXT:    shrdq %cl, %r11, %r10
-; AVX2-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX2-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX2-NEXT:    shrdq %cl, %rbx, %r11
-; AVX2-NEXT:    movq -128(%rsp,%rdx), %r14
-; AVX2-NEXT:    movq -72(%rsp,%rdx), %rdx
+; AVX2-NEXT:    movq (%rsp,%rdx), %r14
+; AVX2-NEXT:    movq 56(%rsp,%rdx), %rdx
 ; AVX2-NEXT:    shrdq %cl, %rdx, %rbx
 ; AVX2-NEXT:    shrdq %cl, %rax, %r14
 ; AVX2-NEXT:    movq %rdi, %rax
@@ -1531,52 +1666,56 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX2-NEXT:    movq %rsi, 16(%rdi)
 ; AVX2-NEXT:    movq %r8, 8(%rdi)
 ; AVX2-NEXT:    movq %r14, (%rdi)
-; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    leaq -16(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ashr_i512_load:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovups (%rsi), %ymm0
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $128, %rsp
+; AVX512F-NEXT:    vmovaps (%rsi), %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rsi), %xmm1
 ; AVX512F-NEXT:    movq 48(%rsi), %rax
 ; AVX512F-NEXT:    movq 56(%rsi), %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512F-NEXT:    sarq $63, %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %edx, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %edx
 ; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX512F-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX512F-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX512F-NEXT:    movq %rax, %r8
 ; AVX512F-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX512F-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX512F-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512F-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX512F-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX512F-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX512F-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rdx), %rdx
+; AVX512F-NEXT:    movq (%rsp,%rdx), %r14
+; AVX512F-NEXT:    movq 56(%rsp,%rdx), %rdx
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rax, %r14
 ; AVX512F-NEXT:    movq %rdi, %rax
@@ -1589,52 +1728,56 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %rsi, 16(%rdi)
 ; AVX512F-NEXT:    movq %r8, 8(%rdi)
 ; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_i512_load:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm0
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $128, %rsp
+; AVX512VL-NEXT:    vmovaps (%rsi), %ymm0
 ; AVX512VL-NEXT:    vmovaps 32(%rsi), %xmm1
 ; AVX512VL-NEXT:    movq 48(%rsi), %rax
 ; AVX512VL-NEXT:    movq 56(%rsi), %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VL-NEXT:    sarq $63, %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %edx, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %edx
 ; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX512VL-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX512VL-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX512VL-NEXT:    movq %rax, %r8
 ; AVX512VL-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX512VL-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX512VL-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX512VL-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX512VL-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX512VL-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rdx), %r14
+; AVX512VL-NEXT:    movq 56(%rsp,%rdx), %r14
 ; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rdx
+; AVX512VL-NEXT:    movq (%rsp,%rdx), %rdx
 ; AVX512VL-NEXT:    shrdq %cl, %rax, %rdx
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
@@ -1646,53 +1789,57 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rsi, 16(%rdi)
 ; AVX512VL-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VL-NEXT:    movq %rdx, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
+; AVX512VL-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_i512_load:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm0
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $128, %rsp
+; AVX512VBMI-NEXT:    vmovaps (%rsi), %ymm0
 ; AVX512VBMI-NEXT:    vmovaps 32(%rsi), %xmm1
 ; AVX512VBMI-NEXT:    movq 48(%rsi), %rax
 ; AVX512VBMI-NEXT:    movq 56(%rsi), %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VBMI-NEXT:    sarq $63, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movl %edx, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %edx
 ; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rax
+; AVX512VBMI-NEXT:    movq 16(%rsp,%rdx), %rsi
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rdx), %rax
 ; AVX512VBMI-NEXT:    movq %rax, %r8
 ; AVX512VBMI-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rdx), %r9
+; AVX512VBMI-NEXT:    movq 24(%rsp,%rdx), %r9
 ; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rdx), %r10
+; AVX512VBMI-NEXT:    movq 32(%rsp,%rdx), %r10
 ; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rdx), %r11
+; AVX512VBMI-NEXT:    movq 40(%rsp,%rdx), %r11
 ; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rdx), %rbx
+; AVX512VBMI-NEXT:    movq 48(%rsp,%rdx), %rbx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rdx), %r14
+; AVX512VBMI-NEXT:    movq 56(%rsp,%rdx), %r14
 ; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT:    movq (%rsp,%rdx), %rdx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rdx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
@@ -1704,9 +1851,10 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
 ; AVX512VBMI-NEXT:    movq %rsi, 16(%rdi)
 ; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VBMI-NEXT:    movq %rdx, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
+; AVX512VBMI-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -1971,7 +2119,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovaps %xmm0, 32(%rdi)
 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, (%rdi)
+; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
 ; AVX2-NEXT:    movq $0, 48(%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1984,7 +2132,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
 ; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovaps %xmm0, 32(%rdi)
 ; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512F-NEXT:    vmovaps %ymm0, (%rdi)
 ; AVX512F-NEXT:    movq $0, 48(%rdi)
 ; AVX512F-NEXT:    retq
 ;
@@ -1996,7 +2144,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
 ; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, 32(%rdi)
 ; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rdi)
 ; AVX512VL-NEXT:    movq $0, 48(%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -2009,7 +2157,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
 ; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX512VBMI-NEXT:    vmovaps %xmm0, 32(%rdi)
 ; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rdi)
 ; AVX512VBMI-NEXT:    movq $0, 48(%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
@@ -2111,99 +2259,106 @@ define i512 @ashr_i512_511(i512 %a0) nounwind {
 define i512 @shl_1_i512(i512 %a0) nounwind {
 ; SSE-LABEL: shl_1_i512:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $128, %rsp
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
+; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $1, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movl %esi, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %esi
 ; SSE-NEXT:    andl $56, %esi
 ; SSE-NEXT:    negl %esi
 ; SSE-NEXT:    movslq %esi, %rax
-; SSE-NEXT:    movq -56(%rsp,%rax), %rdx
-; SSE-NEXT:    movq -48(%rsp,%rax), %r9
+; SSE-NEXT:    movq 72(%rsp,%rax), %rdx
+; SSE-NEXT:    movq 80(%rsp,%rax), %r9
 ; SSE-NEXT:    movq %r9, %rsi
 ; SSE-NEXT:    shldq %cl, %rdx, %rsi
-; SSE-NEXT:    movq -40(%rsp,%rax), %r10
+; SSE-NEXT:    movq 88(%rsp,%rax), %r10
 ; SSE-NEXT:    movq %r10, %r8
 ; SSE-NEXT:    shldq %cl, %r9, %r8
-; SSE-NEXT:    movq -32(%rsp,%rax), %r9
-; SSE-NEXT:    movq %r9, %r11
-; SSE-NEXT:    shldq %cl, %r10, %r11
-; SSE-NEXT:    movq -24(%rsp,%rax), %r10
+; SSE-NEXT:    movq 96(%rsp,%rax), %r11
+; SSE-NEXT:    movq %r11, %r9
+; SSE-NEXT:    shldq %cl, %r10, %r9
+; SSE-NEXT:    movq 104(%rsp,%rax), %r10
 ; SSE-NEXT:    movq %r10, %rbx
-; SSE-NEXT:    shldq %cl, %r9, %rbx
-; SSE-NEXT:    movq -16(%rsp,%rax), %r9
-; SSE-NEXT:    movq %r9, %r14
+; SSE-NEXT:    shldq %cl, %r11, %rbx
+; SSE-NEXT:    movq 112(%rsp,%rax), %r11
+; SSE-NEXT:    movq %r11, %r14
 ; SSE-NEXT:    shldq %cl, %r10, %r14
-; SSE-NEXT:    movq -8(%rsp,%rax), %r10
-; SSE-NEXT:    shldq %cl, %r9, %r10
-; SSE-NEXT:    movq -64(%rsp,%rax), %rax
-; SSE-NEXT:    movq %rax, %r9
-; SSE-NEXT:    shlq %cl, %r9
+; SSE-NEXT:    movq 120(%rsp,%rax), %r10
+; SSE-NEXT:    shldq %cl, %r11, %r10
+; SSE-NEXT:    movq 64(%rsp,%rax), %rax
+; SSE-NEXT:    movq %rax, %r11
+; SSE-NEXT:    shlq %cl, %r11
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; SSE-NEXT:    shldq %cl, %rax, %rdx
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    movq %r10, 56(%rdi)
 ; SSE-NEXT:    movq %r14, 48(%rdi)
 ; SSE-NEXT:    movq %rbx, 40(%rdi)
-; SSE-NEXT:    movq %r11, 32(%rdi)
+; SSE-NEXT:    movq %r9, 32(%rdi)
 ; SSE-NEXT:    movq %r8, 24(%rdi)
 ; SSE-NEXT:    movq %rsi, 16(%rdi)
 ; SSE-NEXT:    movq %rdx, 8(%rdi)
-; SSE-NEXT:    movq %r9, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    movq %r11, (%rdi)
+; SSE-NEXT:    leaq -16(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: shl_1_i512:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $128, %rsp
 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    movl %esi, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %esi
 ; AVX2-NEXT:    andl $56, %esi
 ; AVX2-NEXT:    negl %esi
 ; AVX2-NEXT:    movslq %esi, %r8
-; AVX2-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX2-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 72(%rsp,%r8), %rdx
+; AVX2-NEXT:    movq 80(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %rsi
 ; AVX2-NEXT:    shldq %cl, %rdx, %rsi
-; AVX2-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 88(%rsp,%r8), %r10
 ; AVX2-NEXT:    movq %r10, %r9
 ; AVX2-NEXT:    shldq %cl, %rax, %r9
-; AVX2-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 96(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %r11
 ; AVX2-NEXT:    shldq %cl, %r10, %r11
-; AVX2-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 104(%rsp,%r8), %r10
 ; AVX2-NEXT:    movq %r10, %rbx
 ; AVX2-NEXT:    shldq %cl, %rax, %rbx
-; AVX2-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX2-NEXT:    movq 112(%rsp,%r8), %rax
 ; AVX2-NEXT:    movq %rax, %r14
 ; AVX2-NEXT:    shldq %cl, %r10, %r14
-; AVX2-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX2-NEXT:    movq 120(%rsp,%r8), %r10
 ; AVX2-NEXT:    shldq %cl, %rax, %r10
 ; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT:    movq 64(%rsp,%r8), %rdi
 ; AVX2-NEXT:    shlxq %rcx, %rdi, %r8
 ; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX2-NEXT:    shldq %cl, %rdi, %rdx
@@ -2215,47 +2370,51 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
 ; AVX2-NEXT:    movq %rsi, 16(%rax)
 ; AVX2-NEXT:    movq %rdx, 8(%rax)
 ; AVX2-NEXT:    movq %r8, (%rax)
-; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    leaq -16(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shl_1_i512:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $128, %rsp
 ; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp)
 ; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %esi, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %esi
 ; AVX512F-NEXT:    andl $56, %esi
 ; AVX512F-NEXT:    negl %esi
 ; AVX512F-NEXT:    movslq %esi, %r8
-; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 72(%rsp,%r8), %rdx
+; AVX512F-NEXT:    movq 80(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %rsi
 ; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 88(%rsp,%r8), %r10
 ; AVX512F-NEXT:    movq %r10, %r9
 ; AVX512F-NEXT:    shldq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 96(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %r11
 ; AVX512F-NEXT:    shldq %cl, %r10, %r11
-; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 104(%rsp,%r8), %r10
 ; AVX512F-NEXT:    movq %r10, %rbx
 ; AVX512F-NEXT:    shldq %cl, %rax, %rbx
-; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq 112(%rsp,%r8), %rax
 ; AVX512F-NEXT:    movq %rax, %r14
 ; AVX512F-NEXT:    shldq %cl, %r10, %r14
-; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq 120(%rsp,%r8), %r10
 ; AVX512F-NEXT:    shldq %cl, %rax, %r10
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT:    movq 64(%rsp,%r8), %rdi
 ; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
 ; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
@@ -2267,47 +2426,52 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
 ; AVX512F-NEXT:    movq %rsi, 16(%rax)
 ; AVX512F-NEXT:    movq %rdx, 8(%rax)
 ; AVX512F-NEXT:    movq %r8, (%rax)
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shl_1_i512:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r15
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
 ; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %esi, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %esi
 ; AVX512VL-NEXT:    andl $56, %esi
 ; AVX512VL-NEXT:    negl %esi
 ; AVX512VL-NEXT:    movslq %esi, %r9
-; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT:    movq 72(%rsp,%r9), %rdx
+; AVX512VL-NEXT:    movq 80(%rsp,%r9), %rax
 ; AVX512VL-NEXT:    movq %rax, %rsi
 ; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT:    movq 88(%rsp,%r9), %r10
 ; AVX512VL-NEXT:    movq %r10, %r8
 ; AVX512VL-NEXT:    shldq %cl, %rax, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq 96(%rsp,%r9), %r11
 ; AVX512VL-NEXT:    movq %r11, %rbx
 ; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq 104(%rsp,%r9), %rdi
 ; AVX512VL-NEXT:    movq %rdi, %r10
 ; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT:    movq 64(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq 112(%rsp,%r9), %r14
 ; AVX512VL-NEXT:    movq %r14, %r15
 ; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq 120(%rsp,%r9), %rdi
 ; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
 ; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
 ; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -2320,48 +2484,54 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
 ; AVX512VL-NEXT:    movq %rsi, 16(%rax)
 ; AVX512VL-NEXT:    movq %rdx, 8(%rax)
 ; AVX512VL-NEXT:    movq %r9, (%rax)
+; AVX512VL-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shl_1_i512:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r15
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
 ; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movl %esi, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %esi
 ; AVX512VBMI-NEXT:    andl $56, %esi
 ; AVX512VBMI-NEXT:    negl %esi
 ; AVX512VBMI-NEXT:    movslq %esi, %r9
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT:    movq 72(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT:    movq 80(%rsp,%r9), %rax
 ; AVX512VBMI-NEXT:    movq %rax, %rsi
 ; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT:    movq 88(%rsp,%r9), %r10
 ; AVX512VBMI-NEXT:    movq %r10, %r8
 ; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq 96(%rsp,%r9), %r11
 ; AVX512VBMI-NEXT:    movq %r11, %rbx
 ; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq 104(%rsp,%r9), %rdi
 ; AVX512VBMI-NEXT:    movq %rdi, %r10
 ; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT:    movq 64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq 112(%rsp,%r9), %r14
 ; AVX512VBMI-NEXT:    movq %r14, %r15
 ; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq 120(%rsp,%r9), %rdi
 ; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
 ; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
 ; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -2374,9 +2544,11 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
 ; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
 ; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
 ; AVX512VBMI-NEXT:    movq %r9, (%rax)
+; AVX512VBMI-NEXT:    leaq -24(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
 ; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %r = shl i512 1, %a0
@@ -2386,39 +2558,42 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
 define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 ; SSE-LABEL: lshr_signbit_i512:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $128, %rsp
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
+; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movl %esi, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %esi
 ; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq -112(%rsp,%rsi), %rdx
-; SSE-NEXT:    movq -120(%rsp,%rsi), %rax
+; SSE-NEXT:    movq 16(%rsp,%rsi), %rdx
+; SSE-NEXT:    movq 8(%rsp,%rsi), %rax
 ; SSE-NEXT:    movq %rax, %r8
 ; SSE-NEXT:    shrdq %cl, %rdx, %r8
-; SSE-NEXT:    movq -104(%rsp,%rsi), %r9
+; SSE-NEXT:    movq 24(%rsp,%rsi), %r9
 ; SSE-NEXT:    shrdq %cl, %r9, %rdx
-; SSE-NEXT:    movq -96(%rsp,%rsi), %r10
+; SSE-NEXT:    movq 32(%rsp,%rsi), %r10
 ; SSE-NEXT:    shrdq %cl, %r10, %r9
-; SSE-NEXT:    movq -88(%rsp,%rsi), %r11
+; SSE-NEXT:    movq 40(%rsp,%rsi), %r11
 ; SSE-NEXT:    shrdq %cl, %r11, %r10
-; SSE-NEXT:    movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; SSE-NEXT:    shrdq %cl, %rbx, %r11
-; SSE-NEXT:    movq -72(%rsp,%rsi), %r14
+; SSE-NEXT:    movq 56(%rsp,%rsi), %r14
 ; SSE-NEXT:    shrdq %cl, %r14, %rbx
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT:    movq (%rsp,%rsi), %rsi
 ; SSE-NEXT:    shrdq %cl, %rax, %rsi
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -2431,40 +2606,44 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 ; SSE-NEXT:    movq %rdx, 16(%rdi)
 ; SSE-NEXT:    movq %r8, 8(%rdi)
 ; SSE-NEXT:    movq %rsi, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    leaq -16(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: lshr_signbit_i512:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $128, %rsp
 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    movl %esi, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %esi
 ; AVX2-NEXT:    andl $56, %esi
-; AVX2-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX2-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX2-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX2-NEXT:    movq %rax, %r8
 ; AVX2-NEXT:    shrdq %cl, %rdx, %r8
-; AVX2-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX2-NEXT:    shrdq %cl, %r9, %rdx
-; AVX2-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX2-NEXT:    shrdq %cl, %r10, %r9
-; AVX2-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX2-NEXT:    shrdq %cl, %r11, %r10
-; AVX2-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX2-NEXT:    shrdq %cl, %rbx, %r11
-; AVX2-NEXT:    movq -128(%rsp,%rsi), %r14
-; AVX2-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT:    movq (%rsp,%rsi), %r14
+; AVX2-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; AVX2-NEXT:    shrdq %cl, %rsi, %rbx
 ; AVX2-NEXT:    shrdq %cl, %rax, %r14
 ; AVX2-NEXT:    movq %rdi, %rax
@@ -2477,39 +2656,43 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 ; AVX2-NEXT:    movq %rdx, 16(%rdi)
 ; AVX2-NEXT:    movq %r8, 8(%rdi)
 ; AVX2-NEXT:    movq %r14, (%rdi)
-; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    leaq -16(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: lshr_signbit_i512:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $128, %rsp
 ; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp)
 ; AVX512F-NEXT:    movl %esi, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %esi
 ; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX512F-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX512F-NEXT:    movq %rax, %r8
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX512F-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT:    movq (%rsp,%rsi), %r14
+; AVX512F-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; AVX512F-NEXT:    shrdq %cl, %rsi, %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rax, %r14
 ; AVX512F-NEXT:    movq %rdi, %rax
@@ -2522,41 +2705,45 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 ; AVX512F-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512F-NEXT:    movq %r8, 8(%rdi)
 ; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_signbit_i512:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $128, %rsp
 ; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VL-NEXT:    movl %esi, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %esi
 ; AVX512VL-NEXT:    andl $56, %esi
-; AVX512VL-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX512VL-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX512VL-NEXT:    movq %rax, %r8
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX512VL-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT:    movq 56(%rsp,%rsi), %r14
 ; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT:    movq (%rsp,%rsi), %rsi
 ; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
@@ -2568,42 +2755,46 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 ; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VL-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VL-NEXT:    movq %rsi, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
+; AVX512VL-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_signbit_i512:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $128, %rsp
 ; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VBMI-NEXT:    movl %esi, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %esi
 ; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX512VBMI-NEXT:    movq %rax, %r8
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT:    movq 56(%rsp,%rsi), %r14
 ; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT:    movq (%rsp,%rsi), %rsi
 ; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
@@ -2615,9 +2806,10 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 ; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VBMI-NEXT:    movq %rsi, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
+; AVX512VBMI-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %s = shl i512 1, 511
@@ -2628,43 +2820,46 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 ; SSE-LABEL: ashr_signbit_i512:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
 ; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $128, %rsp
 ; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movl %esi, %ecx
 ; SSE-NEXT:    andl $63, %ecx
 ; SSE-NEXT:    shrl $3, %esi
 ; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq -112(%rsp,%rsi), %rdx
-; SSE-NEXT:    movq -120(%rsp,%rsi), %rax
+; SSE-NEXT:    movq 16(%rsp,%rsi), %rdx
+; SSE-NEXT:    movq 8(%rsp,%rsi), %rax
 ; SSE-NEXT:    movq %rax, %r8
 ; SSE-NEXT:    shrdq %cl, %rdx, %r8
-; SSE-NEXT:    movq -104(%rsp,%rsi), %r9
+; SSE-NEXT:    movq 24(%rsp,%rsi), %r9
 ; SSE-NEXT:    shrdq %cl, %r9, %rdx
-; SSE-NEXT:    movq -96(%rsp,%rsi), %r10
+; SSE-NEXT:    movq 32(%rsp,%rsi), %r10
 ; SSE-NEXT:    shrdq %cl, %r10, %r9
-; SSE-NEXT:    movq -88(%rsp,%rsi), %r11
+; SSE-NEXT:    movq 40(%rsp,%rsi), %r11
 ; SSE-NEXT:    shrdq %cl, %r11, %r10
-; SSE-NEXT:    movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; SSE-NEXT:    shrdq %cl, %rbx, %r11
-; SSE-NEXT:    movq -72(%rsp,%rsi), %r14
+; SSE-NEXT:    movq 56(%rsp,%rsi), %r14
 ; SSE-NEXT:    shrdq %cl, %r14, %rbx
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT:    movq (%rsp,%rsi), %rsi
 ; SSE-NEXT:    shrdq %cl, %rax, %rsi
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -2677,41 +2872,45 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 ; SSE-NEXT:    movq %rdx, 16(%rdi)
 ; SSE-NEXT:    movq %r8, 8(%rdi)
 ; SSE-NEXT:    movq %rsi, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    leaq -16(%rbp), %rsp
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: ashr_signbit_i512:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $128, %rsp
 ; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    movl %esi, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    shrl $3, %esi
 ; AVX2-NEXT:    andl $56, %esi
-; AVX2-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX2-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX2-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX2-NEXT:    movq %rax, %r8
 ; AVX2-NEXT:    shrdq %cl, %rdx, %r8
-; AVX2-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX2-NEXT:    shrdq %cl, %r9, %rdx
-; AVX2-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX2-NEXT:    shrdq %cl, %r10, %r9
-; AVX2-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX2-NEXT:    shrdq %cl, %r11, %r10
-; AVX2-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX2-NEXT:    shrdq %cl, %rbx, %r11
-; AVX2-NEXT:    movq -128(%rsp,%rsi), %r14
-; AVX2-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT:    movq (%rsp,%rsi), %r14
+; AVX2-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; AVX2-NEXT:    shrdq %cl, %rsi, %rbx
 ; AVX2-NEXT:    shrdq %cl, %rax, %r14
 ; AVX2-NEXT:    movq %rdi, %rax
@@ -2724,39 +2923,43 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 ; AVX2-NEXT:    movq %rdx, 16(%rdi)
 ; AVX2-NEXT:    movq %r8, 8(%rdi)
 ; AVX2-NEXT:    movq %r14, (%rdi)
-; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    leaq -16(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ashr_signbit_i512:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $128, %rsp
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
-; AVX512F-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp)
 ; AVX512F-NEXT:    movl %esi, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
 ; AVX512F-NEXT:    shrl $3, %esi
 ; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX512F-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX512F-NEXT:    movq %rax, %r8
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX512F-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT:    movq (%rsp,%rsi), %r14
+; AVX512F-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; AVX512F-NEXT:    shrdq %cl, %rsi, %rbx
 ; AVX512F-NEXT:    shrdq %cl, %rax, %r14
 ; AVX512F-NEXT:    movq %rdi, %rax
@@ -2769,42 +2972,46 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 ; AVX512F-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512F-NEXT:    movq %r8, 8(%rdi)
 ; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_signbit_i512:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $128, %rsp
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VL-NEXT:    movl %esi, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
 ; AVX512VL-NEXT:    shrl $3, %esi
 ; AVX512VL-NEXT:    andl $56, %esi
-; AVX512VL-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX512VL-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX512VL-NEXT:    movq %rax, %r8
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX512VL-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT:    movq 56(%rsp,%rsi), %r14
 ; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT:    movq (%rsp,%rsi), %rsi
 ; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
@@ -2816,43 +3023,47 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 ; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VL-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VL-NEXT:    movq %rsi, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
+; AVX512VL-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_signbit_i512:
 ; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $128, %rsp
 ; AVX512VBMI-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VBMI-NEXT:    movl %esi, %ecx
 ; AVX512VBMI-NEXT:    andl $63, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %esi
 ; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT:    movq 16(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rsi), %rax
 ; AVX512VBMI-NEXT:    movq %rax, %r8
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT:    movq 24(%rsp,%rsi), %r9
 ; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT:    movq 32(%rsp,%rsi), %r10
 ; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT:    movq 40(%rsp,%rsi), %r11
 ; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT:    movq 48(%rsp,%rsi), %rbx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT:    movq 56(%rsp,%rsi), %r14
 ; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT:    movq (%rsp,%rsi), %rsi
 ; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
@@ -2864,9 +3075,10 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 ; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
 ; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
 ; AVX512VBMI-NEXT:    movq %rsi, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
+; AVX512VBMI-NEXT:    leaq -16(%rbp), %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %s = shl i512 1, 511
@@ -2877,130 +3089,150 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 define i64 @lshr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
 ; SSE-LABEL: lshr_extract_i512_i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $160, %rsp
+; SSE-NEXT:    movq 32(%rbp), %r10
+; SSE-NEXT:    movaps 16(%rbp), %xmm0
 ; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdi, (%rsp)
 ; SSE-NEXT:    movl %r10d, %ecx
 ; SSE-NEXT:    shrl $3, %ecx
 ; SSE-NEXT:    andl $56, %ecx
-; SSE-NEXT:    movq -128(%rsp,%rcx), %rax
-; SSE-NEXT:    movq -120(%rsp,%rcx), %rdx
+; SSE-NEXT:    movq (%rsp,%rcx), %rax
+; SSE-NEXT:    movq 8(%rsp,%rcx), %rdx
 ; SSE-NEXT:    movl %r10d, %ecx
 ; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    popq %rcx
+; SSE-NEXT:    movq %rbp, %rsp
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: lshr_extract_i512_i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $160, %rsp
+; AVX2-NEXT:    movq 32(%rbp), %r10
+; AVX2-NEXT:    vmovaps 16(%rbp), %xmm0
 ; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdi, (%rsp)
 ; AVX2-NEXT:    movl %r10d, %ecx
 ; AVX2-NEXT:    shrl $3, %ecx
 ; AVX2-NEXT:    andl $56, %ecx
-; AVX2-NEXT:    movq -128(%rsp,%rcx), %rax
-; AVX2-NEXT:    movq -120(%rsp,%rcx), %rdx
+; AVX2-NEXT:    movq (%rsp,%rcx), %rax
+; AVX2-NEXT:    movq 8(%rsp,%rcx), %rdx
 ; AVX2-NEXT:    movl %r10d, %ecx
 ; AVX2-NEXT:    shrdq %cl, %rdx, %rax
-; AVX2-NEXT:    popq %rcx
+; AVX2-NEXT:    movq %rbp, %rsp
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: lshr_extract_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $160, %rsp
+; AVX512F-NEXT:    movq 32(%rbp), %r10
+; AVX512F-NEXT:    vmovaps 16(%rbp), %xmm0
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdi, (%rsp)
 ; AVX512F-NEXT:    movl %r10d, %ecx
 ; AVX512F-NEXT:    shrl $3, %ecx
 ; AVX512F-NEXT:    andl $56, %ecx
-; AVX512F-NEXT:    movq -128(%rsp,%rcx), %rax
-; AVX512F-NEXT:    movq -120(%rsp,%rcx), %rdx
+; AVX512F-NEXT:    movq (%rsp,%rcx), %rax
+; AVX512F-NEXT:    movq 8(%rsp,%rcx), %rdx
 ; AVX512F-NEXT:    movl %r10d, %ecx
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512F-NEXT:    popq %rcx
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_extract_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    movq 32(%rbp), %r10
+; AVX512VL-NEXT:    vmovaps 16(%rbp), %xmm0
 ; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdi, (%rsp)
 ; AVX512VL-NEXT:    movl %r10d, %ecx
 ; AVX512VL-NEXT:    shrl $3, %ecx
 ; AVX512VL-NEXT:    andl $56, %ecx
-; AVX512VL-NEXT:    movq -128(%rsp,%rcx), %rax
-; AVX512VL-NEXT:    movq -120(%rsp,%rcx), %rdx
+; AVX512VL-NEXT:    movq (%rsp,%rcx), %rax
+; AVX512VL-NEXT:    movq 8(%rsp,%rcx), %rdx
 ; AVX512VL-NEXT:    movl %r10d, %ecx
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    movq %rbp, %rsp
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_extract_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    movq 32(%rbp), %r10
+; AVX512VBMI-NEXT:    vmovaps 16(%rbp), %xmm0
 ; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, (%rsp)
 ; AVX512VBMI-NEXT:    movl %r10d, %ecx
 ; AVX512VBMI-NEXT:    shrl $3, %ecx
 ; AVX512VBMI-NEXT:    andl $56, %ecx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rcx), %rax
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rcx), %rdx
+; AVX512VBMI-NEXT:    movq (%rsp,%rcx), %rax
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rcx), %rdx
 ; AVX512VBMI-NEXT:    movl %r10d, %ecx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %b = lshr i512 %a0, %a1
@@ -3011,36 +3243,40 @@ define i64 @lshr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
 define i64 @ashr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
 ; CHECK-LABEL: ashr_extract_i512_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    andq $-32, %rsp
+; CHECK-NEXT:    subq $160, %rsp
 ; CHECK-NEXT:    movq %rcx, %rax
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq 32(%rbp), %rcx
+; CHECK-NEXT:    movq 16(%rbp), %r10
+; CHECK-NEXT:    movq 24(%rbp), %r11
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rdi, (%rsp)
 ; CHECK-NEXT:    sarq $63, %r11
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    shrl $3, %edx
 ; CHECK-NEXT:    andl $56, %edx
-; CHECK-NEXT:    movq -128(%rsp,%rdx), %rax
-; CHECK-NEXT:    movq -120(%rsp,%rdx), %rdx
+; CHECK-NEXT:    movq (%rsp,%rdx), %rax
+; CHECK-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrdq %cl, %rdx, %rax
-; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %b = ashr i512 %a0, %a1
   %r = trunc i512 %b to i64
@@ -3050,112 +3286,132 @@ define i64 @ashr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
 define i64 @lshr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ; SSE-LABEL: lshr_extract_load_i512_i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $160, %rsp
 ; SSE-NEXT:    movq %rsi, %rcx
 ; SSE-NEXT:    movaps (%rdi), %xmm0
 ; SSE-NEXT:    movaps 16(%rdi), %xmm1
 ; SSE-NEXT:    movaps 32(%rdi), %xmm2
 ; SSE-NEXT:    movaps 48(%rdi), %xmm3
 ; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
 ; SSE-NEXT:    movl %ecx, %edx
 ; SSE-NEXT:    shrl $3, %edx
 ; SSE-NEXT:    andl $56, %edx
-; SSE-NEXT:    movq -128(%rsp,%rdx), %rax
-; SSE-NEXT:    movq -120(%rsp,%rdx), %rdx
+; SSE-NEXT:    movq (%rsp,%rdx), %rax
+; SSE-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    popq %rcx
+; SSE-NEXT:    movq %rbp, %rsp
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: lshr_extract_load_i512_i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $160, %rsp
 ; AVX2-NEXT:    movq %rsi, %rcx
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX2-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    movl %ecx, %edx
 ; AVX2-NEXT:    shrl $3, %edx
 ; AVX2-NEXT:    andl $56, %edx
-; AVX2-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX2-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX2-NEXT:    movq (%rsp,%rdx), %rax
+; AVX2-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX2-NEXT:    shrdq %cl, %rdx, %rax
-; AVX2-NEXT:    popq %rcx
+; AVX2-NEXT:    movq %rbp, %rsp
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: lshr_extract_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $160, %rsp
 ; AVX512F-NEXT:    movq %rsi, %rcx
 ; AVX512F-NEXT:    vmovups (%rdi), %zmm0
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp)
 ; AVX512F-NEXT:    movl %ecx, %edx
 ; AVX512F-NEXT:    shrl $3, %edx
 ; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT:    movq (%rsp,%rdx), %rax
+; AVX512F-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512F-NEXT:    popq %rcx
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_extract_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX512VL-NEXT:    movq %rsi, %rcx
 ; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VL-NEXT:    movl %ecx, %edx
 ; AVX512VL-NEXT:    shrl $3, %edx
 ; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT:    movq (%rsp,%rdx), %rax
+; AVX512VL-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    movq %rbp, %rsp
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_extract_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX512VBMI-NEXT:    movq %rsi, %rcx
 ; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VBMI-NEXT:    movl %ecx, %edx
 ; AVX512VBMI-NEXT:    shrl $3, %edx
 ; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT:    movq (%rsp,%rdx), %rax
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -3167,161 +3423,181 @@ define i64 @lshr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 define i64 @ashr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ; SSE-LABEL: ashr_extract_load_i512_i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $160, %rsp
 ; SSE-NEXT:    movq %rsi, %rcx
 ; SSE-NEXT:    movaps (%rdi), %xmm0
 ; SSE-NEXT:    movaps 16(%rdi), %xmm1
 ; SSE-NEXT:    movaps 32(%rdi), %xmm2
 ; SSE-NEXT:    movq 48(%rdi), %rax
 ; SSE-NEXT:    movq 56(%rdi), %rdx
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
 ; SSE-NEXT:    sarq $63, %rdx
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movl %ecx, %edx
 ; SSE-NEXT:    shrl $3, %edx
 ; SSE-NEXT:    andl $56, %edx
-; SSE-NEXT:    movq -128(%rsp,%rdx), %rax
-; SSE-NEXT:    movq -120(%rsp,%rdx), %rdx
+; SSE-NEXT:    movq (%rsp,%rdx), %rax
+; SSE-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; SSE-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    popq %rcx
+; SSE-NEXT:    movq %rbp, %rsp
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: ashr_extract_load_i512_i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $160, %rsp
 ; AVX2-NEXT:    movq %rsi, %rcx
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
+; AVX2-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX2-NEXT:    movq 48(%rdi), %rax
 ; AVX2-NEXT:    movq 56(%rdi), %rdx
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    sarq $63, %rdx
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    movl %ecx, %edx
 ; AVX2-NEXT:    shrl $3, %edx
 ; AVX2-NEXT:    andl $56, %edx
-; AVX2-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX2-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX2-NEXT:    movq (%rsp,%rdx), %rax
+; AVX2-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX2-NEXT:    shrdq %cl, %rdx, %rax
-; AVX2-NEXT:    popq %rcx
+; AVX2-NEXT:    movq %rbp, %rsp
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ashr_extract_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $160, %rsp
 ; AVX512F-NEXT:    movq %rsi, %rcx
-; AVX512F-NEXT:    vmovups (%rdi), %ymm0
+; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX512F-NEXT:    movq 48(%rdi), %rax
 ; AVX512F-NEXT:    movq 56(%rdi), %rdx
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512F-NEXT:    sarq $63, %rdx
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %ecx, %edx
 ; AVX512F-NEXT:    shrl $3, %edx
 ; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT:    movq (%rsp,%rdx), %rax
+; AVX512F-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512F-NEXT:    popq %rcx
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_extract_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
 ; AVX512VL-NEXT:    movq %rsi, %rcx
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512VL-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX512VL-NEXT:    movq 48(%rdi), %rax
 ; AVX512VL-NEXT:    movq 56(%rdi), %rdx
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    sarq $63, %rdx
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %ecx, %edx
 ; AVX512VL-NEXT:    shrl $3, %edx
 ; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT:    movq (%rsp,%rdx), %rax
+; AVX512VL-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    movq %rbp, %rsp
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_extract_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
 ; AVX512VBMI-NEXT:    movq %rsi, %rcx
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512VBMI-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX512VBMI-NEXT:    movq 48(%rdi), %rax
 ; AVX512VBMI-NEXT:    movq 56(%rdi), %rdx
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX512VBMI-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    sarq $63, %rdx
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movl %ecx, %edx
 ; AVX512VBMI-NEXT:    shrl $3, %edx
 ; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT:    movq (%rsp,%rdx), %rax
+; AVX512VBMI-NEXT:    movq 8(%rsp,%rdx), %rdx
 ; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -3333,82 +3609,102 @@ define i64 @ashr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 define i64 @lshr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ; SSE-LABEL: lshr_extract_idx_load_i512_i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $160, %rsp
 ; SSE-NEXT:    movaps (%rdi), %xmm0
 ; SSE-NEXT:    movaps 16(%rdi), %xmm1
 ; SSE-NEXT:    movaps 32(%rdi), %xmm2
 ; SSE-NEXT:    movaps 48(%rdi), %xmm3
 ; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
 ; SSE-NEXT:    andl $7, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; SSE-NEXT:    popq %rcx
+; SSE-NEXT:    movq (%rsp,%rsi,8), %rax
+; SSE-NEXT:    movq %rbp, %rsp
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: lshr_extract_idx_load_i512_i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $160, %rsp
+; AVX2-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    andl $7, %esi
-; AVX2-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX2-NEXT:    popq %rcx
+; AVX2-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX2-NEXT:    movq %rbp, %rsp
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: lshr_extract_idx_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $160, %rsp
 ; AVX512F-NEXT:    vmovups (%rdi), %zmm0
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp)
 ; AVX512F-NEXT:    andl $7, %esi
-; AVX512F-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT:    popq %rcx
+; AVX512F-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_extract_idx_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VL-NEXT:    andl $7, %esi
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX512VL-NEXT:    movq %rbp, %rsp
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_extract_idx_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX512VBMI-NEXT:    andl $7, %esi
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -3421,131 +3717,151 @@ define i64 @lshr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 define i64 @ashr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ; SSE-LABEL: ashr_extract_idx_load_i512_i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    movq %rsp, %rbp
+; SSE-NEXT:    andq $-32, %rsp
+; SSE-NEXT:    subq $160, %rsp
 ; SSE-NEXT:    movaps (%rdi), %xmm0
 ; SSE-NEXT:    movaps 16(%rdi), %xmm1
 ; SSE-NEXT:    movaps 32(%rdi), %xmm2
 ; SSE-NEXT:    movq 48(%rdi), %rax
 ; SSE-NEXT:    movq 56(%rdi), %rcx
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, (%rsp)
 ; SSE-NEXT:    sarq $63, %rcx
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    andl $7, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; SSE-NEXT:    popq %rcx
+; SSE-NEXT:    movq (%rsp,%rsi,8), %rax
+; SSE-NEXT:    movq %rbp, %rsp
+; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: ashr_extract_idx_load_i512_i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $160, %rsp
+; AVX2-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX2-NEXT:    movq 48(%rdi), %rax
 ; AVX2-NEXT:    movq 56(%rdi), %rcx
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    sarq $63, %rcx
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    andl $7, %esi
-; AVX2-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX2-NEXT:    popq %rcx
+; AVX2-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX2-NEXT:    movq %rbp, %rsp
+; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ashr_extract_idx_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovups (%rdi), %ymm0
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $160, %rsp
+; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX512F-NEXT:    movq 48(%rdi), %rax
 ; AVX512F-NEXT:    movq 56(%rdi), %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    sarq $63, %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    andl $7, %esi
-; AVX512F-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT:    popq %rcx
+; AVX512F-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_extract_idx_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
+; AVX512VL-NEXT:    pushq %rbp
+; AVX512VL-NEXT:    movq %rsp, %rbp
+; AVX512VL-NEXT:    andq $-32, %rsp
+; AVX512VL-NEXT:    subq $160, %rsp
+; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512VL-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX512VL-NEXT:    movq 48(%rdi), %rax
 ; AVX512VL-NEXT:    movq 56(%rdi), %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    sarq $63, %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    andl $7, %esi
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX512VL-NEXT:    movq %rbp, %rsp
+; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_extract_idx_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-32, %rsp
+; AVX512VBMI-NEXT:    subq $160, %rsp
+; AVX512VBMI-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512VBMI-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX512VBMI-NEXT:    movq 48(%rdi), %rax
 ; AVX512VBMI-NEXT:    movq 56(%rdi), %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX512VBMI-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    sarq $63, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    andl $7, %esi
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    movq (%rsp,%rsi,8), %rax
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index df167338268c4..672aacc4771d0 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -696,143 +696,117 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-LABEL: smul_ovf:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $192, %rsp
+; X64-NEXT:    movq %r9, %r13
+; X64-NEXT:    movq %r8, %r9
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rcx, %r14
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rsi, %r12
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; X64-NEXT:    andl $1, %r13d
-; X64-NEXT:    negq %r13
 ; X64-NEXT:    andl $1, %r14d
 ; X64-NEXT:    negq %r14
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rdx, %r12
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    addq %rax, %r12
-; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    addq %rdi, %r11
-; X64-NEXT:    adcq %r12, %rcx
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r10, %rbp
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r8, %rsi
-; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %ebp
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rsi, %r10
-; X64-NEXT:    adcq %rbp, %r8
-; X64-NEXT:    addq %rdi, %r10
-; X64-NEXT:    adcq %r12, %r8
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rsi, %rbx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq %r9, %rbx
-; X64-NEXT:    adcq %rsi, %r15
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    adcq %rdx, %rsi
-; X64-NEXT:    addq %r9, %r15
-; X64-NEXT:    adcq %rbx, %rsi
-; X64-NEXT:    addq %r9, %r10
-; X64-NEXT:    adcq %r8, %rbx
-; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq 16(%rbp), %r15
+; X64-NEXT:    andl $1, %r15d
+; X64-NEXT:    negq %r15
+; X64-NEXT:    subq $8, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %r14, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %r14, %r8
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    sarq $63, %rdi
-; X64-NEXT:    addq %r11, %r15
-; X64-NEXT:    adcq %rcx, %rsi
-; X64-NEXT:    movq %rdi, %r9
-; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    setb %al
-; X64-NEXT:    addq %r8, %r11
-; X64-NEXT:    movzbl %al, %r12d
-; X64-NEXT:    adcq %rdx, %r12
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    imulq %r14
-; X64-NEXT:    addq %rax, %rax
-; X64-NEXT:    adcq %rdx, %rdx
-; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    adcq %r12, %rdx
-; X64-NEXT:    addq %r8, %r15
-; X64-NEXT:    adcq %rsi, %rcx
-; X64-NEXT:    adcq %r9, %rax
-; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    sarq $63, %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    sarq $63, %r8
+; X64-NEXT:    addq %r9, %rdx
+; X64-NEXT:    adcq %rsi, %rdi
+; X64-NEXT:    movq %r10, %r9
+; X64-NEXT:    adcq %r8, %r9
+; X64-NEXT:    adcq %r10, %r8
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq %rcx, %rsi
 ; X64-NEXT:    sarq $63, %rsi
-; X64-NEXT:    xorq %rsi, %rax
-; X64-NEXT:    xorq %rsi, %r15
-; X64-NEXT:    orq %rax, %r15
-; X64-NEXT:    xorq %rsi, %rdx
-; X64-NEXT:    xorq %rcx, %rsi
-; X64-NEXT:    orq %rdx, %rsi
-; X64-NEXT:    orq %r15, %rsi
-; X64-NEXT:    movl %r10d, %edx
+; X64-NEXT:    xorq %rsi, %rdi
+; X64-NEXT:    xorq %rsi, %r8
+; X64-NEXT:    orq %rdi, %r8
+; X64-NEXT:    xorq %rsi, %r9
+; X64-NEXT:    xorq %rdx, %rsi
+; X64-NEXT:    orq %r9, %rsi
+; X64-NEXT:    orq %r8, %rsi
+; X64-NEXT:    movl %r11d, %edx
 ; X64-NEXT:    andl $1, %edx
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    negq %rcx
-; X64-NEXT:    xorq %rcx, %rbx
-; X64-NEXT:    xorq %r10, %rcx
-; X64-NEXT:    orq %rbx, %rcx
-; X64-NEXT:    orq %rsi, %rcx
+; X64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, 8(%rax)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, (%rax)
+; X64-NEXT:    movaps %xmm0, (%rax)
 ; X64-NEXT:    movb %dl, 16(%rax)
+; X64-NEXT:    negq %rdx
+; X64-NEXT:    xorq %rdx, %rcx
+; X64-NEXT:    xorq %r11, %rdx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    orq %rsi, %rdx
 ; X64-NEXT:    setne 32(%rax)
+; X64-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 13596e1b18768..631c3b3c44358 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -296,215 +296,121 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64:       ## %bb.0:
 ; X64-NEXT:    pushq %rbp
 ; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    .cfi_offset %rbp, -16
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    .cfi_def_cfa_register %rbp
 ; X64-NEXT:    pushq %r15
-; X64-NEXT:    .cfi_def_cfa_offset 24
 ; X64-NEXT:    pushq %r14
-; X64-NEXT:    .cfi_def_cfa_offset 32
 ; X64-NEXT:    pushq %r13
-; X64-NEXT:    .cfi_def_cfa_offset 40
 ; X64-NEXT:    pushq %r12
-; X64-NEXT:    .cfi_def_cfa_offset 48
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 56
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $192, %rsp
 ; X64-NEXT:    .cfi_offset %rbx, -56
 ; X64-NEXT:    .cfi_offset %r12, -48
 ; X64-NEXT:    .cfi_offset %r13, -40
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
-; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movq %r8, %r12
-; X64-NEXT:    movq %rcx, %rbx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rsi, %r10
-; X64-NEXT:    movq %rdi, %r11
-; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    movq %r9, %r14
+; X64-NEXT:    movq %r8, %r13
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r14, %r13
-; X64-NEXT:    adcq %rcx, %rsi
-; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %ecx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    adcq %rcx, %r8
-; X64-NEXT:    movq %rbx, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    movq %r9, %rsi
-; X64-NEXT:    imulq %rcx, %rsi
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    addq %rsi, %r15
-; X64-NEXT:    addq %rax, %r14
-; X64-NEXT:    adcq %r8, %r15
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rsi, %r12
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    addq %r12, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %rdi, %rbx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %rbx, %rsi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r13, %rbp
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r15, %r12
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    subq $8, %rsp
+; X64-NEXT:    movq %rcx, %r12
 ; X64-NEXT:    sarq $63, %r12
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rdi, %r9
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r11
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %r13, %rdi
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rdi, %r13
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    movq %r11, %rdi
-; X64-NEXT:    movq %r11, %r8
-; X64-NEXT:    sarq $63, %rdi
-; X64-NEXT:    imulq %rdi, %r10
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %r10, %r11
-; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    addq %rax, %r13
-; X64-NEXT:    adcq %r9, %r11
-; X64-NEXT:    addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
-; X64-NEXT:    adcq %rbp, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r11, %rbp
-; X64-NEXT:    sarq $63, %rbp
-; X64-NEXT:    addq %r14, %r13
-; X64-NEXT:    adcq %r15, %r11
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    adcq %rbp, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r12, %rbp
-; X64-NEXT:    movq %r8, %rbx
-; X64-NEXT:    imulq %rcx, %r8
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rax, %rsi
-; X64-NEXT:    addq %r8, %rsi
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
-; X64-NEXT:    imulq %r12, %rcx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    movq %r12, %r8
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq ___multi5
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq ___multi5
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    movq 24(%rbp), %rax
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    addq %rcx, %r10
-; X64-NEXT:    addq %rax, %r10
-; X64-NEXT:    addq %r9, %r14
-; X64-NEXT:    adcq %rsi, %r10
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    sarq $63, %r14
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq 16(%rbp), %r13
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r15, %rdi
-; X64-NEXT:    adcq %r9, %r8
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    addq %r8, %rax
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    adcq %r10, %rdx
-; X64-NEXT:    addq %r13, %rsi
-; X64-NEXT:    adcq %r11, %rdi
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
-; X64-NEXT:    adcq %rbp, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
-; X64-NEXT:    movq %r8, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    xorq %rcx, %rax
-; X64-NEXT:    xorq %rcx, %rsi
-; X64-NEXT:    orq %rax, %rsi
-; X64-NEXT:    xorq %rcx, %rdx
-; X64-NEXT:    xorq %rdi, %rcx
-; X64-NEXT:    orq %rdx, %rcx
-; X64-NEXT:    orq %rsi, %rcx
+; X64-NEXT:    callq ___multi5
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    movq %r12, %r8
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    callq ___multi5
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq %r8, 24(%rax)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT:    movq %rcx, (%rax)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT:    movq %rcx, 8(%rax)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT:    movq %rcx, 16(%rax)
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %r8, %r10
+; X64-NEXT:    sarq $63, %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    sarq $63, %rdi
+; X64-NEXT:    addq %r9, %rdx
+; X64-NEXT:    adcq %r8, %rsi
+; X64-NEXT:    movq %r10, %r8
+; X64-NEXT:    adcq %rdi, %r8
+; X64-NEXT:    adcq %r10, %rdi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq 32(%rbp), %r9
+; X64-NEXT:    movq %rax, 24(%r9)
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    xorq %rax, %rsi
+; X64-NEXT:    xorq %rax, %rdi
+; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    xorq %rax, %r8
+; X64-NEXT:    xorq %rdx, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    orq %rdi, %rax
 ; X64-NEXT:    setne %al
+; X64-NEXT:    movaps %xmm0, (%r9)
+; X64-NEXT:    movq %rcx, 16(%r9)
+; X64-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/udivmodei5.ll b/llvm/test/CodeGen/X86/udivmodei5.ll
index 2c30357180e40..dd76235e29eef 100644
--- a/llvm/test/CodeGen/X86/udivmodei5.ll
+++ b/llvm/test/CodeGen/X86/udivmodei5.ll
@@ -1,10 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
 
 ; On i686, this is expanded into a loop. On x86_64, this calls __udivti3.
 define i65 @udiv65(i65 %a, i65 %b) nounwind {
 ; X86-LABEL: udiv65:
-; X86-NOT:     call
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $128, %esp
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    sete (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    shldl $31, %edx, %edi
+; X86-NEXT:    shldl $31, %ebx, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB0_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB0_3
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB0_3: # %_udiv-special-cases
+; X86-NEXT:    shll $31, %ebx
+; X86-NEXT:    movl $64, %esi
+; X86-NEXT:    jne .LBB0_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    movl $64, %ebx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    je .LBB0_7
+; X86-NEXT:    jmp .LBB0_8
+; X86-NEXT:  .LBB0_4:
+; X86-NEXT:    bsrl %ebx, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    jne .LBB0_8
+; X86-NEXT:  .LBB0_7: # %_udiv-special-cases
+; X86-NEXT:    addl $64, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:  .LBB0_8: # %_udiv-special-cases
+; X86-NEXT:    movl 16(%ebp), %edi
+; X86-NEXT:    shldl $31, %ecx, %edi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl 8(%ebp), %ebx
+; X86-NEXT:    shldl $31, %ebx, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB0_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    jmp .LBB0_11
+; X86-NEXT:  .LBB0_9:
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:  .LBB0_11: # %_udiv-special-cases
+; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT:    shll $31, %ebx
+; X86-NEXT:    je .LBB0_13
+; X86-NEXT:  # %bb.12:
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB0_13: # %_udiv-special-cases
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB0_15
+; X86-NEXT:  # %bb.14: # %_udiv-special-cases
+; X86-NEXT:    addl $64, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:  .LBB0_15: # %_udiv-special-cases
+; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    subl %edi, %eax
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    jne .LBB0_16
+; X86-NEXT:  # %bb.17: # %select.false.sink
+; X86-NEXT:    movl $64, %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:  .LBB0_18: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    jne .LBB0_20
+; X86-NEXT:  # %bb.19: # %select.end
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:  .LBB0_20: # %select.end
+; X86-NEXT:    jne .LBB0_21
+; X86-NEXT:  # %bb.27: # %select.end
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $64, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    je .LBB0_28
+; X86-NEXT:  # %bb.25: # %udiv-bb1
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl $1, %edx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movb $64, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 96(%esp,%eax), %edi
+; X86-NEXT:    movl 100(%esp,%eax), %esi
+; X86-NEXT:    movl 104(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    je .LBB0_26
+; X86-NEXT:  # %bb.22: # %udiv-preheader
+; X86-NEXT:    andl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 56(%esp,%edx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%esp,%edx), %eax
+; X86-NEXT:    movl 52(%esp,%edx), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $1, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB0_23: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    shrl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    shrl $31, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    andl 24(%ebp), %ebx
+; X86-NEXT:    andl 20(%ebp), %esi
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl $-1, %esi
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    adcl $1, %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    jne .LBB0_23
+; X86-NEXT:  .LBB0_24: # %udiv-loop-exit
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    shrl $31, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    leal (%edx,%eax,2), %edx
+; X86-NEXT:  .LBB0_28: # %udiv-end
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_16:
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    jmp .LBB0_18
+; X86-NEXT:  .LBB0_26:
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB0_24
+; X86-NEXT:  .LBB0_21:
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    jmp .LBB0_28
 ;
 ; X64-LABEL: udiv65:
 ; X64:       # %bb.0:
@@ -18,53 +303,4712 @@ define i65 @udiv65(i65 %a, i65 %b) nounwind {
   ret i65 %res
 }
 
+; On both i686 and x86_64, i129 division is expanded inline (ExpandLargeDivRem).
+; MaxDivRemBitWidthSupported=128 on x86_64, so i129 exceeds the DAG limit.
 define i129 @udiv129(i129 %a, i129 %b) nounwind {
 ; X86-LABEL: udiv129:
-; X86-NOT:     call
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $240, %esp
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    orl 44(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    orl 24(%ebp), %ecx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    orl 20(%ebp), %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    shldl $31, %edx, %eax
+; X86-NEXT:    shldl $31, %ebx, %edx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB1_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edx, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
+; X86-NEXT:    jmp .LBB1_3
+; X86-NEXT:  .LBB1_1:
+; X86-NEXT:    bsrl %eax, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:  .LBB1_3: # %_udiv-special-cases
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    shldl $31, %edi, %esi
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    shldl $31, %ecx, %edi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB1_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB1_6
+; X86-NEXT:  .LBB1_4:
+; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB1_6: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    jne .LBB1_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB1_8: # %_udiv-special-cases
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    orb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    je .LBB1_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    je .LBB1_11
+; X86-NEXT:  .LBB1_12: # %_udiv-special-cases
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    jne .LBB1_14
+; X86-NEXT:  .LBB1_13: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB1_14: # %_udiv-special-cases
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shldl $31, %ebx, %edi
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB1_15
+; X86-NEXT:  # %bb.16: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    jmp .LBB1_17
+; X86-NEXT:  .LBB1_9: # %_udiv-special-cases
+; X86-NEXT:    movl $64, %ecx
+; X86-NEXT:    jne .LBB1_12
+; X86-NEXT:  .LBB1_11: # %_udiv-special-cases
+; X86-NEXT:    movl $128, %ecx
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    je .LBB1_13
+; X86-NEXT:    jmp .LBB1_14
+; X86-NEXT:  .LBB1_15:
+; X86-NEXT:    bsrl %ecx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:  .LBB1_17: # %_udiv-special-cases
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    shldl $31, %ecx, %ebx
+; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB1_18
+; X86-NEXT:  # %bb.19: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB1_20
+; X86-NEXT:  .LBB1_18:
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB1_20: # %_udiv-special-cases
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB1_22
+; X86-NEXT:  # %bb.21: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB1_22: # %_udiv-special-cases
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    jne .LBB1_23
+; X86-NEXT:  # %bb.24: # %_udiv-special-cases
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    je .LBB1_26
+; X86-NEXT:    jmp .LBB1_27
+; X86-NEXT:  .LBB1_23:
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    jne .LBB1_27
+; X86-NEXT:  .LBB1_26: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:  .LBB1_27: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB1_28
+; X86-NEXT:  # %bb.29: # %select.false.sink
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    cmpl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %cl
+; X86-NEXT:  .LBB1_30: # %select.end
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    jne .LBB1_32
+; X86-NEXT:  # %bb.31: # %select.end
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:  .LBB1_32: # %select.end
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB1_33
+; X86-NEXT:  # %bb.39: # %select.end
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    xorl $128, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    je .LBB1_40
+; X86-NEXT:  # %bb.37: # %udiv-bb1
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movb $-128, %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $28, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 200(%esp,%edx), %eax
+; X86-NEXT:    movl 204(%esp,%edx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %eax, %ebx
+; X86-NEXT:    movl 196(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 192(%esp,%edx), %eax
+; X86-NEXT:    movl 208(%esp,%edx), %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl %cl, %edx, %edi
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    je .LBB1_38
+; X86-NEXT:  # %bb.34: # %udiv-preheader
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 112(%esp,%edx,4), %eax
+; X86-NEXT:    movl 108(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl 104(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%edx,4), %eax
+; X86-NEXT:    movl 100(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $1, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB1_35: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    leal (%eax,%edx,2), %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    andl 44(%ebp), %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    andl 40(%ebp), %eax
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    andl 36(%ebp), %esi
+; X86-NEXT:    andl 32(%ebp), %ebx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    subl %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    adcl $1, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jne .LBB1_35
+; X86-NEXT:  .LBB1_36: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    leal (%ebx,%edi,2), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB1_40: # %udiv-end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movb %cl, 16(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB1_28:
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    jmp .LBB1_30
+; X86-NEXT:  .LBB1_38:
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB1_36
+; X86-NEXT:  .LBB1_33:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jmp .LBB1_40
 ;
 ; X64-LABEL: udiv129:
-; X64-NOT:     call
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $224, %rsp
+; X64-NEXT:    movq %rcx, %r13
+; X64-NEXT:    movl %edx, %r14d
+; X64-NEXT:    andl $1, %r14d
+; X64-NEXT:    movl %r9d, %r12d
+; X64-NEXT:    andl $1, %r12d
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    orq %r12, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    sete %cl
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %r14, %rax
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    shldq $63, %r8, %r9
+; X64-NEXT:    bsrq %r9, %rcx
+; X64-NEXT:    xorq $63, %rcx
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    shldq $63, %r13, %r11
+; X64-NEXT:    bsrq %r11, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    orq $64, %r10
+; X64-NEXT:    testq %r9, %r9
+; X64-NEXT:    cmovneq %rcx, %r10
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    shlq $63, %rcx
+; X64-NEXT:    bsrq %rcx, %r15
+; X64-NEXT:    xorq $63, %r15
+; X64-NEXT:    testq %rcx, %rcx
+; X64-NEXT:    movl $128, %ecx
+; X64-NEXT:    cmoveq %rcx, %r15
+; X64-NEXT:    subq $-128, %r15
+; X64-NEXT:    orq %r9, %r11
+; X64-NEXT:    cmovneq %r10, %r15
+; X64-NEXT:    shldq $63, %rsi, %rdx
+; X64-NEXT:    bsrq %rdx, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    movq %rsi, %r9
+; X64-NEXT:    shldq $63, %rdi, %r9
+; X64-NEXT:    bsrq %r9, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    orq $64, %r10
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %r11, %r10
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    shlq $63, %rbx
+; X64-NEXT:    bsrq %rbx, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    testq %rbx, %rbx
+; X64-NEXT:    cmoveq %rcx, %r11
+; X64-NEXT:    subq $-128, %r11
+; X64-NEXT:    orq %rdx, %r9
+; X64-NEXT:    cmovneq %r10, %r11
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    subq %r11, %r15
+; X64-NEXT:    movl $0, %ebx
+; X64-NEXT:    sbbq %rbx, %rbx
+; X64-NEXT:    sbbq %r9, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB1_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %r15, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %rbx, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r9, %rcx
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB1_3: # %select.end
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    cmovneq %rcx, %rdx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    cmovneq %rcx, %rax
+; X64-NEXT:    cmoveq %r14, %rcx
+; X64-NEXT:    jne .LBB1_9
+; X64-NEXT:  # %bb.4: # %select.end
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %r10
+; X64-NEXT:    xorq $128, %r10
+; X64-NEXT:    orq %r9, %r10
+; X64-NEXT:    orq %rbx, %r10
+; X64-NEXT:    je .LBB1_9
+; X64-NEXT:  # %bb.5: # %udiv-bb1
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    addq $1, %rdx
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb $-128, %cl
+; X64-NEXT:    subb %r15b, %cl
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $3, %al
+; X64-NEXT:    andb $24, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movsbq %al, %r10
+; X64-NEXT:    movq 160(%rsp,%r10), %rax
+; X64-NEXT:    movq 168(%rsp,%r10), %r12
+; X64-NEXT:    movq 176(%rsp,%r10), %r11
+; X64-NEXT:    shldq %cl, %r12, %r11
+; X64-NEXT:    shldq %cl, %rax, %r12
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    orq %rbx, %rcx
+; X64-NEXT:    je .LBB1_10
+; X64-NEXT:  # %bb.6: # %udiv-preheader
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    movq 80(%rsp,%rcx,8), %rdi
+; X64-NEXT:    movq 64(%rsp,%rcx,8), %rsi
+; X64-NEXT:    movq 72(%rsp,%rcx,8), %r10
+; X64-NEXT:    movq %r10, %r14
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    shrdq %cl, %rdi, %r14
+; X64-NEXT:    shrdq %cl, %r10, %rsi
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    addq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $1, %rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    xorl %r13d, %r13d
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    xorl %r15d, %r15d
+; X64-NEXT:    movq %r14, %r10
+; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB1_7: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %rsi, %r10
+; X64-NEXT:    shrq $63, %r14
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    leaq (%r11,%rsi,2), %rsi
+; X64-NEXT:    shldq $1, %rax, %rcx
+; X64-NEXT:    orq %rdi, %rcx
+; X64-NEXT:    shrq $63, %r12
+; X64-NEXT:    addq %rax, %rax
+; X64-NEXT:    orq %r13, %rax
+; X64-NEXT:    orl %r12d, %r15d
+; X64-NEXT:    movl %r15d, %r11d
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    cmpq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    sbbq %r10, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    sbbq %r14, %rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    movl %edi, %r13d
+; X64-NEXT:    andl $1, %r13d
+; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    subq %rdi, %rsi
+; X64-NEXT:    sbbq %r8, %r10
+; X64-NEXT:    addq $-1, %rdx
+; X64-NEXT:    adcq $-1, %rbx
+; X64-NEXT:    adcq $1, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    orq %r9, %rdi
+; X64-NEXT:    orq %rbx, %rdi
+; X64-NEXT:    movl $0, %edi
+; X64-NEXT:    movl $0, %r15d
+; X64-NEXT:    movq %r10, %r14
+; X64-NEXT:    movq %rcx, %r12
+; X64-NEXT:    jne .LBB1_7
+; X64-NEXT:  .LBB1_8: # %udiv-loop-exit
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    shldq $1, %rax, %rdx
+; X64-NEXT:    shrq $63, %rcx
+; X64-NEXT:    leaq (%r13,%rax,2), %rax
+; X64-NEXT:  .LBB1_9: # %udiv-end
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB1_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB1_3
+; X64-NEXT:  .LBB1_10:
+; X64-NEXT:    xorl %r13d, %r13d
+; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    jmp .LBB1_8
   %res = udiv i129 %a, %b
   ret i129 %res
 }
 
 define i129 @urem129(i129 %a, i129 %b) nounwind {
 ; X86-LABEL: urem129:
-; X86-NOT:     call
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $240, %esp
+; X86-NEXT:    movl 28(%ebp), %ebx
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    orl 44(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    orl 24(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    orl 20(%ebp), %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    shldl $31, %ecx, %edx
+; X86-NEXT:    shldl $31, %edi, %ecx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB2_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB2_3
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB2_3: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    shldl $31, %edi, %esi
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    shldl $31, %eax, %edi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB2_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB2_6
+; X86-NEXT:  .LBB2_4:
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB2_6: # %_udiv-special-cases
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB2_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB2_8: # %_udiv-special-cases
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    orb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    shll $31, %ecx
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    je .LBB2_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    je .LBB2_11
+; X86-NEXT:  .LBB2_12: # %_udiv-special-cases
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    jne .LBB2_14
+; X86-NEXT:  .LBB2_13: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:  .LBB2_14: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    shldl $31, %ecx, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    shldl $31, %ebx, %edi
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB2_15
+; X86-NEXT:  # %bb.16: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    jmp .LBB2_17
+; X86-NEXT:  .LBB2_9: # %_udiv-special-cases
+; X86-NEXT:    movl $64, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jne .LBB2_12
+; X86-NEXT:  .LBB2_11: # %_udiv-special-cases
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    je .LBB2_13
+; X86-NEXT:    jmp .LBB2_14
+; X86-NEXT:  .LBB2_15:
+; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:  .LBB2_17: # %_udiv-special-cases
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    shldl $31, %eax, %ebx
+; X86-NEXT:    shldl $31, %ecx, %eax
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB2_18
+; X86-NEXT:  # %bb.19: # %_udiv-special-cases
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB2_20
+; X86-NEXT:  .LBB2_18:
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB2_20: # %_udiv-special-cases
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB2_22
+; X86-NEXT:  # %bb.21: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB2_22: # %_udiv-special-cases
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    shll $31, %ecx
+; X86-NEXT:    jne .LBB2_23
+; X86-NEXT:  # %bb.24: # %_udiv-special-cases
+; X86-NEXT:    movl $128, %ecx
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    je .LBB2_26
+; X86-NEXT:    jmp .LBB2_27
+; X86-NEXT:  .LBB2_23:
+; X86-NEXT:    bsrl %ecx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    jne .LBB2_27
+; X86-NEXT:  .LBB2_26: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:  .LBB2_27: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB2_28
+; X86-NEXT:  # %bb.29: # %select.false.sink
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:  .LBB2_30: # %select.end
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    jne .LBB2_32
+; X86-NEXT:  # %bb.31: # %select.end
+; X86-NEXT:    movl 16(%ebp), %ebx
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:  .LBB2_32: # %select.end
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB2_38
+; X86-NEXT:  # %bb.33: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $128, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    je .LBB2_38
+; X86-NEXT:  # %bb.34: # %udiv-bb1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    addl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movb $-128, %cl
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $28, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 200(%esp,%eax), %esi
+; X86-NEXT:    movl 204(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl 196(%esp,%eax), %edx
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 192(%esp,%eax), %esi
+; X86-NEXT:    movl 208(%esp,%eax), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    je .LBB2_39
+; X86-NEXT:  # %bb.35: # %udiv-preheader
+; X86-NEXT:    andl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 112(%esp,%eax,4), %edx
+; X86-NEXT:    movl 108(%esp,%eax,4), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl 104(%esp,%eax,4), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%eax,4), %edx
+; X86-NEXT:    movl 100(%esp,%eax,4), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $1, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB2_36: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    leal (%edx,%eax,2), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    andl 44(%ebp), %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl 40(%ebp), %ebx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    andl 36(%ebp), %edx
+; X86-NEXT:    andl 32(%ebp), %edi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    subl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    adcl $1, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    jne .LBB2_36
+; X86-NEXT:  .LBB2_37: # %udiv-loop-exit
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    leal (%esi,%eax,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:  .LBB2_38: # %udiv-end
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    imull 44(%ebp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    imull 36(%ebp), %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    imull 40(%ebp), %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull 32(%ebp), %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    movb %bl, 16(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB2_28:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    jmp .LBB2_30
+; X86-NEXT:  .LBB2_39:
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB2_37
 ;
 ; X64-LABEL: urem129:
-; X64-NOT:     call
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $224, %rsp
+; X64-NEXT:    movq %r8, %r14
+; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    movl %edx, %r13d
+; X64-NEXT:    andl $1, %r13d
+; X64-NEXT:    movl %r9d, %esi
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    sete %sil
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %r13, %rax
+; X64-NEXT:    orq %r10, %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    shldq $63, %r8, %r9
+; X64-NEXT:    bsrq %r9, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    shldq $63, %rcx, %rsi
+; X64-NEXT:    bsrq %rsi, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    orq $64, %rdi
+; X64-NEXT:    testq %r9, %r9
+; X64-NEXT:    cmovneq %r8, %rdi
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    shlq $63, %r8
+; X64-NEXT:    bsrq %r8, %rbx
+; X64-NEXT:    xorq $63, %rbx
+; X64-NEXT:    testq %r8, %r8
+; X64-NEXT:    movl $128, %r11d
+; X64-NEXT:    cmoveq %r11, %rbx
+; X64-NEXT:    subq $-128, %rbx
+; X64-NEXT:    orq %r9, %rsi
+; X64-NEXT:    cmovneq %rdi, %rbx
+; X64-NEXT:    shldq $63, %r10, %rdx
+; X64-NEXT:    bsrq %rdx, %rsi
+; X64-NEXT:    xorq $63, %rsi
+; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    shldq $63, %r15, %rdi
+; X64-NEXT:    bsrq %rdi, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    orq $64, %r8
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %rsi, %r8
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    shlq $63, %rsi
+; X64-NEXT:    bsrq %rsi, %r9
+; X64-NEXT:    xorq $63, %r9
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    cmoveq %r11, %r9
+; X64-NEXT:    subq $-128, %r9
+; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    cmovneq %r8, %r9
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    subq %r9, %rbx
+; X64-NEXT:    movl $0, %r9d
+; X64-NEXT:    sbbq %r9, %r9
+; X64-NEXT:    sbbq %rdx, %rdx
+; X64-NEXT:    andl $1, %edx
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB2_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rbx, %r11
+; X64-NEXT:    movl $0, %esi
+; X64-NEXT:    sbbq %r9, %rsi
+; X64-NEXT:    movl $0, %esi
+; X64-NEXT:    sbbq %rdx, %rsi
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB2_3: # %select.end
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq %r10, %rsi
+; X64-NEXT:    cmovneq %rdi, %rsi
+; X64-NEXT:    movq %r15, %r8
+; X64-NEXT:    cmovneq %rdi, %r8
+; X64-NEXT:    cmoveq %r13, %rdi
+; X64-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; X64-NEXT:    jne .LBB2_9
+; X64-NEXT:  # %bb.4: # %select.end
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    xorq $128, %rax
+; X64-NEXT:    orq %rdx, %rax
+; X64-NEXT:    orq %r9, %rax
+; X64-NEXT:    je .LBB2_9
+; X64-NEXT:  # %bb.5: # %udiv-bb1
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    addq $1, %rax
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    andl $1, %edx
+; X64-NEXT:    movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r13, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb $-128, %cl
+; X64-NEXT:    subb %bl, %cl
+; X64-NEXT:    movl %ecx, %esi
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    andb $24, %sil
+; X64-NEXT:    negb %sil
+; X64-NEXT:    movsbq %sil, %rsi
+; X64-NEXT:    movq 160(%rsp,%rsi), %rbx
+; X64-NEXT:    movq 168(%rsp,%rsi), %r12
+; X64-NEXT:    movq 176(%rsp,%rsi), %r11
+; X64-NEXT:    shldq %cl, %r12, %r11
+; X64-NEXT:    shldq %cl, %rbx, %r12
+; X64-NEXT:    shlq %cl, %rbx
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    je .LBB2_10
+; X64-NEXT:  # %bb.6: # %udiv-preheader
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r13, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    movq 80(%rsp,%rcx,8), %rsi
+; X64-NEXT:    movq 64(%rsp,%rcx,8), %r13
+; X64-NEXT:    movq 72(%rsp,%rcx,8), %rdi
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NEXT:    shrdq %cl, %rdi, %r13
+; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    addq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    adcq $1, %rsi
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    xorl %r15d, %r15d
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r14d, %r14d
+; X64-NEXT:    movq %r10, %r8
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB2_7: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %r13, %r8
+; X64-NEXT:    shrq $63, %r10
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    leaq (%r11,%r13,2), %r13
+; X64-NEXT:    shldq $1, %rbx, %rdi
+; X64-NEXT:    orq %rcx, %rdi
+; X64-NEXT:    shrq $63, %r12
+; X64-NEXT:    addq %rbx, %rbx
+; X64-NEXT:    orq %r15, %rbx
+; X64-NEXT:    orl %r12d, %r14d
+; X64-NEXT:    movl %r14d, %r11d
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    cmpq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %r8, %rcx
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    sbbq %r10, %rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    movl %ecx, %r15d
+; X64-NEXT:    andl $1, %r15d
+; X64-NEXT:    movq %rcx, %r10
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT:    andq (%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT:    subq %rcx, %r13
+; X64-NEXT:    sbbq %r10, %r8
+; X64-NEXT:    addq $-1, %rax
+; X64-NEXT:    adcq $-1, %r9
+; X64-NEXT:    adcq $1, %rdx
+; X64-NEXT:    andl $1, %edx
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    movl $0, %r14d
+; X64-NEXT:    movq %r8, %r10
+; X64-NEXT:    movq %rdi, %r12
+; X64-NEXT:    jne .LBB2_7
+; X64-NEXT:  .LBB2_8: # %udiv-loop-exit
+; X64-NEXT:    movq %rdi, %rsi
+; X64-NEXT:    shldq $1, %rbx, %rsi
+; X64-NEXT:    shrq $63, %rdi
+; X64-NEXT:    leaq (%r15,%rbx,2), %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:  .LBB2_9: # %udiv-end
+; X64-NEXT:    movq (%rsp), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %r9, %r14
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    adcq %rbx, %rdx
+; X64-NEXT:    imulq %rsi, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    imulq %r8, %rsi
+; X64-NEXT:    imulq %r12, %rdi
+; X64-NEXT:    addq %rsi, %rdi
+; X64-NEXT:    addq %r11, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    subq %rcx, %r15
+; X64-NEXT:    sbbq %rax, %r10
+; X64-NEXT:    sbbq %rdi, %r13
+; X64-NEXT:    andl $1, %r13d
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r10, %rdx
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB2_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB2_3
+; X64-NEXT:  .LBB2_10:
+; X64-NEXT:    xorl %r15d, %r15d
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    jmp .LBB2_8
   %res = urem i129 %a, %b
   ret i129 %res
 }
 
 define i129 @sdiv129(i129 %a, i129 %b) nounwind {
 ; X86-LABEL: sdiv129:
-; X86-NOT:     call
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $256, %esp # imm = 0x100
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    movl 12(%ebp), %ebx
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    shldl $31, %edi, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    shldl $31, %esi, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB3_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    jmp .LBB3_3
+; X86-NEXT:  .LBB3_1:
+; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:  .LBB3_3: # %_udiv-special-cases
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $31, %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $31, %eax, %ebx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB3_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB3_6
+; X86-NEXT:  .LBB3_4:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB3_6: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB3_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:  .LBB3_8: # %_udiv-special-cases
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    orb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    je .LBB3_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    je .LBB3_11
+; X86-NEXT:  .LBB3_12: # %_udiv-special-cases
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB3_14
+; X86-NEXT:  .LBB3_13: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:  .LBB3_14: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $31, %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $31, %edi, %edx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB3_15
+; X86-NEXT:  # %bb.16: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB3_17
+; X86-NEXT:  .LBB3_9: # %_udiv-special-cases
+; X86-NEXT:    movl $64, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    jne .LBB3_12
+; X86-NEXT:  .LBB3_11: # %_udiv-special-cases
+; X86-NEXT:    movl $128, %ecx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    je .LBB3_13
+; X86-NEXT:    jmp .LBB3_14
+; X86-NEXT:  .LBB3_15:
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB3_17: # %_udiv-special-cases
+; X86-NEXT:    shldl $31, %eax, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $31, %eax, %ebx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB3_18
+; X86-NEXT:  # %bb.19: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB3_20
+; X86-NEXT:  .LBB3_18:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB3_20: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB3_22
+; X86-NEXT:  # %bb.21: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:  .LBB3_22: # %_udiv-special-cases
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    jne .LBB3_23
+; X86-NEXT:  # %bb.24: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $128, %ecx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    je .LBB3_26
+; X86-NEXT:    jmp .LBB3_27
+; X86-NEXT:  .LBB3_23:
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB3_27
+; X86-NEXT:  .LBB3_26: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:  .LBB3_27: # %_udiv-special-cases
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB3_28
+; X86-NEXT:  # %bb.29: # %select.false.sink
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    cmpl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:  .LBB3_30: # %select.end
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jne .LBB3_32
+; X86-NEXT:  # %bb.31: # %select.end
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:  .LBB3_32: # %select.end
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB3_38
+; X86-NEXT:  # %bb.33: # %select.end
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $128, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    je .LBB3_38
+; X86-NEXT:  # %bb.34: # %udiv-bb1
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movb $-128, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $28, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 216(%esp,%edx), %eax
+; X86-NEXT:    movl 220(%esp,%edx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %eax, %ebx
+; X86-NEXT:    movl 212(%esp,%edx), %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 208(%esp,%edx), %eax
+; X86-NEXT:    movl 224(%esp,%edx), %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl %cl, %edx, %edi
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    je .LBB3_39
+; X86-NEXT:  # %bb.35: # %udiv-preheader
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 128(%esp,%edx,4), %eax
+; X86-NEXT:    movl 124(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl 120(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 112(%esp,%edx,4), %eax
+; X86-NEXT:    movl 116(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $1, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB3_36: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    leal (%eax,%edx,2), %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    subl %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    adcl $1, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jne .LBB3_36
+; X86-NEXT:  .LBB3_37: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    leal (%edx,%ecx,2), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB3_38: # %udiv-end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movb %cl, 16(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB3_28:
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    jmp .LBB3_30
+; X86-NEXT:  .LBB3_39:
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB3_37
 ;
 ; X64-LABEL: sdiv129:
-; X64-NOT:     call
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $224, %rsp
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movl %r9d, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negq %rax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    xorq %rcx, %rdx
+; X64-NEXT:    xorq %rcx, %rsi
+; X64-NEXT:    xorq %rcx, %rdi
+; X64-NEXT:    subq %rcx, %rdi
+; X64-NEXT:    sbbq %rcx, %rsi
+; X64-NEXT:    sbbq %rcx, %rdx
+; X64-NEXT:    movl %edx, %r12d
+; X64-NEXT:    andl $1, %r12d
+; X64-NEXT:    xorq %rax, %r9
+; X64-NEXT:    xorq %rax, %r8
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    subq %rax, %r14
+; X64-NEXT:    sbbq %rax, %r8
+; X64-NEXT:    sbbq %rax, %r9
+; X64-NEXT:    movl %r9d, %r10d
+; X64-NEXT:    andl $1, %r10d
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r10, %rax
+; X64-NEXT:    orq %r8, %rax
+; X64-NEXT:    sete %cl
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %r12, %rax
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    shldq $63, %r8, %r9
+; X64-NEXT:    bsrq %r9, %rcx
+; X64-NEXT:    xorq $63, %rcx
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    shldq $63, %r14, %r11
+; X64-NEXT:    bsrq %r11, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    orq $64, %r10
+; X64-NEXT:    testq %r9, %r9
+; X64-NEXT:    cmovneq %rcx, %r10
+; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    shlq $63, %rcx
+; X64-NEXT:    bsrq %rcx, %r13
+; X64-NEXT:    xorq $63, %r13
+; X64-NEXT:    testq %rcx, %rcx
+; X64-NEXT:    movl $128, %ecx
+; X64-NEXT:    cmoveq %rcx, %r13
+; X64-NEXT:    subq $-128, %r13
+; X64-NEXT:    orq %r9, %r11
+; X64-NEXT:    cmovneq %r10, %r13
+; X64-NEXT:    shldq $63, %rsi, %rdx
+; X64-NEXT:    bsrq %rdx, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    movq %rsi, %r9
+; X64-NEXT:    shldq $63, %rdi, %r9
+; X64-NEXT:    bsrq %r9, %r10
+; X64-NEXT:    xorq $63, %r10
+; X64-NEXT:    orq $64, %r10
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %r11, %r10
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    shlq $63, %rbx
+; X64-NEXT:    bsrq %rbx, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    testq %rbx, %rbx
+; X64-NEXT:    cmoveq %rcx, %r11
+; X64-NEXT:    subq $-128, %r11
+; X64-NEXT:    orq %rdx, %r9
+; X64-NEXT:    cmovneq %r10, %r11
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    subq %r11, %r13
+; X64-NEXT:    movl $0, %r15d
+; X64-NEXT:    sbbq %r15, %r15
+; X64-NEXT:    sbbq %r9, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB3_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %r13, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r15, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r9, %rcx
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB3_3: # %select.end
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    cmovneq %rcx, %rdx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    cmovneq %rcx, %rax
+; X64-NEXT:    cmoveq %r12, %rcx
+; X64-NEXT:    jne .LBB3_9
+; X64-NEXT:  # %bb.4: # %select.end
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r13, %r8
+; X64-NEXT:    xorq $128, %r8
+; X64-NEXT:    orq %r9, %r8
+; X64-NEXT:    orq %r15, %r8
+; X64-NEXT:    je .LBB3_9
+; X64-NEXT:  # %bb.5: # %udiv-bb1
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    movq %r13, %rdx
+; X64-NEXT:    addq $1, %rdx
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb $-128, %cl
+; X64-NEXT:    subb %r13b, %cl
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $3, %al
+; X64-NEXT:    andb $24, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    movsbq %al, %r8
+; X64-NEXT:    movq 160(%rsp,%r8), %rax
+; X64-NEXT:    movq 168(%rsp,%r8), %rbx
+; X64-NEXT:    movq 176(%rsp,%r8), %r14
+; X64-NEXT:    shldq %cl, %rbx, %r14
+; X64-NEXT:    shldq %cl, %rax, %rbx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    orq %r15, %rcx
+; X64-NEXT:    je .LBB3_10
+; X64-NEXT:  # %bb.6: # %udiv-preheader
+; X64-NEXT:    andl $1, %r14d
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    movq 80(%rsp,%rcx,8), %rdi
+; X64-NEXT:    movq 64(%rsp,%rcx,8), %rsi
+; X64-NEXT:    movq 72(%rsp,%rcx,8), %r10
+; X64-NEXT:    movq %r10, %r8
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    shrdq %cl, %rdi, %r8
+; X64-NEXT:    shrdq %cl, %r10, %rsi
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq $-1, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    adcq $1, %r12
+; X64-NEXT:    andl $1, %r12d
+; X64-NEXT:    xorl %r11d, %r11d
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    xorl %r13d, %r13d
+; X64-NEXT:    movq %r8, %r10
+; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB3_7: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %rsi, %r10
+; X64-NEXT:    shrq $63, %r8
+; X64-NEXT:    andl $1, %r14d
+; X64-NEXT:    leaq (%r14,%rsi,2), %rsi
+; X64-NEXT:    shldq $1, %rax, %rcx
+; X64-NEXT:    orq %rdi, %rcx
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    addq %rax, %rax
+; X64-NEXT:    orq %r11, %rax
+; X64-NEXT:    orl %ebx, %r13d
+; X64-NEXT:    movl %r13d, %r14d
+; X64-NEXT:    andl $1, %r14d
+; X64-NEXT:    cmpq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    sbbq %r10, %rdi
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    sbbq %r8, %rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    movl %edi, %r11d
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    subq %rdi, %rsi
+; X64-NEXT:    sbbq %r8, %r10
+; X64-NEXT:    addq $-1, %rdx
+; X64-NEXT:    adcq $-1, %r15
+; X64-NEXT:    adcq $1, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    orq %r9, %rdi
+; X64-NEXT:    orq %r15, %rdi
+; X64-NEXT:    movl $0, %edi
+; X64-NEXT:    movl $0, %r13d
+; X64-NEXT:    movq %r10, %r8
+; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    jne .LBB3_7
+; X64-NEXT:  .LBB3_8: # %udiv-loop-exit
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    shldq $1, %rax, %rdx
+; X64-NEXT:    shrq $63, %rcx
+; X64-NEXT:    leaq (%r11,%rax,2), %rax
+; X64-NEXT:  .LBB3_9: # %udiv-end
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    xorq %rdi, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    xorq %rsi, %rdx
+; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    subq %rsi, %rax
+; X64-NEXT:    sbbq %rsi, %rdx
+; X64-NEXT:    sbbq %rdi, %rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB3_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB3_3
+; X64-NEXT:  .LBB3_10:
+; X64-NEXT:    xorl %r11d, %r11d
+; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    jmp .LBB3_8
   %res = sdiv i129 %a, %b
   ret i129 %res
 }
 
 define i129 @srem129(i129 %a, i129 %b) nounwind {
 ; X86-LABEL: srem129:
-; X86-NOT:     call
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $288, %esp # imm = 0x120
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    negl %eax
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 12(%ebp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl 44(%ebp), %ebx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    subl %edx, %esi
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    shldl $31, %ebx, %edx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shldl $31, %edi, %eax
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    bsrl %edx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:  .LBB4_3: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $31, %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $31, %edx, %edi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    jne .LBB4_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB4_6
+; X86-NEXT:  .LBB4_4:
+; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB4_6: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB4_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB4_8: # %_udiv-special-cases
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    orb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    je .LBB4_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    je .LBB4_11
+; X86-NEXT:  .LBB4_12: # %_udiv-special-cases
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    jne .LBB4_14
+; X86-NEXT:  .LBB4_13: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB4_14: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shldl $31, %ebx, %edi
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_15
+; X86-NEXT:  # %bb.16: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    jmp .LBB4_17
+; X86-NEXT:  .LBB4_9: # %_udiv-special-cases
+; X86-NEXT:    movl $64, %ecx
+; X86-NEXT:    jne .LBB4_12
+; X86-NEXT:  .LBB4_11: # %_udiv-special-cases
+; X86-NEXT:    movl $128, %ecx
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    je .LBB4_13
+; X86-NEXT:    jmp .LBB4_14
+; X86-NEXT:  .LBB4_15:
+; X86-NEXT:    bsrl %ecx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:  .LBB4_17: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $31, %ecx, %ebx
+; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB4_18
+; X86-NEXT:  # %bb.19: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB4_20
+; X86-NEXT:  .LBB4_18:
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB4_20: # %_udiv-special-cases
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB4_22
+; X86-NEXT:  # %bb.21: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB4_22: # %_udiv-special-cases
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    jne .LBB4_23
+; X86-NEXT:  # %bb.24: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $128, %esi
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    je .LBB4_26
+; X86-NEXT:    jmp .LBB4_27
+; X86-NEXT:  .LBB4_23:
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    jne .LBB4_27
+; X86-NEXT:  .LBB4_26: # %_udiv-special-cases
+; X86-NEXT:    subl $-128, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB4_27: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_28
+; X86-NEXT:  # %bb.29: # %select.false.sink
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:  .LBB4_30: # %select.end
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    jne .LBB4_32
+; X86-NEXT:  # %bb.31: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:  .LBB4_32: # %select.end
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_38
+; X86-NEXT:  # %bb.33: # %select.end
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorl $128, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    je .LBB4_38
+; X86-NEXT:  # %bb.34: # %udiv-bb1
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movb $-128, %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $28, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 248(%esp,%eax), %esi
+; X86-NEXT:    movl 252(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl 244(%esp,%eax), %edx
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 240(%esp,%eax), %esi
+; X86-NEXT:    movl 256(%esp,%eax), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    je .LBB4_39
+; X86-NEXT:  # %bb.35: # %udiv-preheader
+; X86-NEXT:    andl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $5, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 160(%esp,%eax,4), %edx
+; X86-NEXT:    movl 156(%esp,%eax,4), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl 152(%esp,%eax,4), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 144(%esp,%eax,4), %ebx
+; X86-NEXT:    movl 148(%esp,%eax,4), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $1, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB4_36: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    shrl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    leal (%ebx,%eax,2), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    addl %ebx, %ebx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    subl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $1, %edi
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    jne .LBB4_36
+; X86-NEXT:  .LBB4_37: # %udiv-loop-exit
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB4_38: # %udiv-end
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    imull %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ebx, (%edx)
+; X86-NEXT:    movl %edi, 4(%edx)
+; X86-NEXT:    movl %esi, 8(%edx)
+; X86-NEXT:    movl %ecx, 12(%edx)
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movb %al, 16(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB4_28:
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    jmp .LBB4_30
+; X86-NEXT:  .LBB4_39:
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB4_37
 ;
 ; X64-LABEL: srem129:
-; X64-NOT:     call
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $256, %rsp # imm = 0x100
+; X64-NEXT:    movq %r8, %r14
+; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    movq %rdi, %r11
+; X64-NEXT:    movl %r9d, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negq %rax
+; X64-NEXT:    movl %edx, %r15d
+; X64-NEXT:    andl $1, %r15d
+; X64-NEXT:    movq %r15, %r12
+; X64-NEXT:    negq %r12
+; X64-NEXT:    xorq %r12, %rdx
+; X64-NEXT:    xorq %r12, %r10
+; X64-NEXT:    xorq %r12, %r11
+; X64-NEXT:    subq %r12, %r11
+; X64-NEXT:    sbbq %r12, %r10
+; X64-NEXT:    sbbq %r12, %rdx
+; X64-NEXT:    movl %edx, %r13d
+; X64-NEXT:    andl $1, %r13d
+; X64-NEXT:    xorq %rax, %r9
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    subq %rax, %rcx
+; X64-NEXT:    sbbq %rax, %r14
+; X64-NEXT:    sbbq %rax, %r9
+; X64-NEXT:    movl %r9d, %esi
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    orq %r14, %rax
+; X64-NEXT:    sete %sil
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    orq %r13, %rax
+; X64-NEXT:    orq %r10, %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    shldq $63, %r14, %r9
+; X64-NEXT:    bsrq %r9, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    shldq $63, %rcx, %rsi
+; X64-NEXT:    bsrq %rsi, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    orq $64, %rdi
+; X64-NEXT:    testq %r9, %r9
+; X64-NEXT:    cmovneq %r8, %rdi
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shlq $63, %rcx
+; X64-NEXT:    bsrq %rcx, %rbx
+; X64-NEXT:    xorq $63, %rbx
+; X64-NEXT:    testq %rcx, %rcx
+; X64-NEXT:    movl $128, %ecx
+; X64-NEXT:    cmoveq %rcx, %rbx
+; X64-NEXT:    subq $-128, %rbx
+; X64-NEXT:    orq %r9, %rsi
+; X64-NEXT:    cmovneq %rdi, %rbx
+; X64-NEXT:    shldq $63, %r10, %rdx
+; X64-NEXT:    bsrq %rdx, %rsi
+; X64-NEXT:    xorq $63, %rsi
+; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    shldq $63, %r11, %rdi
+; X64-NEXT:    bsrq %rdi, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    orq $64, %r8
+; X64-NEXT:    testq %rdx, %rdx
+; X64-NEXT:    cmovneq %rsi, %r8
+; X64-NEXT:    movq %r11, %rsi
+; X64-NEXT:    shlq $63, %rsi
+; X64-NEXT:    bsrq %rsi, %r9
+; X64-NEXT:    xorq $63, %r9
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    cmoveq %rcx, %r9
+; X64-NEXT:    subq $-128, %r9
+; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    cmovneq %r8, %r9
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    subq %r9, %rbx
+; X64-NEXT:    movl $0, %r9d
+; X64-NEXT:    sbbq %r9, %r9
+; X64-NEXT:    sbbq %rdx, %rdx
+; X64-NEXT:    andl $1, %edx
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB4_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rbx, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %r9, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    sbbq %rdx, %rcx
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB4_3: # %select.end
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq %r10, %r8
+; X64-NEXT:    cmovneq %rsi, %r8
+; X64-NEXT:    movq %r11, %rdi
+; X64-NEXT:    cmovneq %rsi, %rdi
+; X64-NEXT:    cmoveq %r13, %rsi
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    jne .LBB4_9
+; X64-NEXT:  # %bb.4: # %select.end
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    xorq $128, %rax
+; X64-NEXT:    orq %rdx, %rax
+; X64-NEXT:    orq %r9, %rax
+; X64-NEXT:    je .LBB4_9
+; X64-NEXT:  # %bb.5: # %udiv-bb1
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    addq $1, %rax
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    andl $1, %edx
+; X64-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb $-128, %cl
+; X64-NEXT:    subb %bl, %cl
+; X64-NEXT:    movl %ecx, %esi
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    andb $24, %sil
+; X64-NEXT:    negb %sil
+; X64-NEXT:    movsbq %sil, %rsi
+; X64-NEXT:    movq 192(%rsp,%rsi), %rbx
+; X64-NEXT:    movq 200(%rsp,%rsi), %r15
+; X64-NEXT:    movq 208(%rsp,%rsi), %r8
+; X64-NEXT:    shldq %cl, %r15, %r8
+; X64-NEXT:    shldq %cl, %rbx, %r15
+; X64-NEXT:    shlq %cl, %rbx
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    je .LBB4_10
+; X64-NEXT:  # %bb.6: # %udiv-preheader
+; X64-NEXT:    andl $1, %r8d
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shrb $6, %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    movq 112(%rsp,%rcx,8), %rsi
+; X64-NEXT:    movq 96(%rsp,%rcx,8), %r13
+; X64-NEXT:    movq 104(%rsp,%rcx,8), %rdi
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NEXT:    shrdq %cl, %rdi, %r13
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    addq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    adcq $1, %r11
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r14d, %r14d
+; X64-NEXT:    movq %r10, %r12
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB4_7: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %r13, %r12
+; X64-NEXT:    shrq $63, %r10
+; X64-NEXT:    andl $1, %r8d
+; X64-NEXT:    leaq (%r8,%r13,2), %r13
+; X64-NEXT:    shldq $1, %rbx, %rsi
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    shrq $63, %r15
+; X64-NEXT:    addq %rbx, %rbx
+; X64-NEXT:    orq %rdi, %rbx
+; X64-NEXT:    orl %r15d, %r14d
+; X64-NEXT:    movl %r14d, %r8d
+; X64-NEXT:    andl $1, %r8d
+; X64-NEXT:    cmpq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %r12, %rcx
+; X64-NEXT:    movq %r11, %rcx
+; X64-NEXT:    sbbq %r10, %rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    movl %ecx, %edi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    movq %rcx, %r10
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    subq %rcx, %r13
+; X64-NEXT:    sbbq %r10, %r12
+; X64-NEXT:    addq $-1, %rax
+; X64-NEXT:    adcq $-1, %r9
+; X64-NEXT:    adcq $1, %rdx
+; X64-NEXT:    andl $1, %edx
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    movl $0, %ecx
+; X64-NEXT:    movl $0, %r14d
+; X64-NEXT:    movq %r12, %r10
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    jne .LBB4_7
+; X64-NEXT:  .LBB4_8: # %udiv-loop-exit
+; X64-NEXT:    movq %rsi, %r8
+; X64-NEXT:    shldq $1, %rbx, %r8
+; X64-NEXT:    shrq $63, %rsi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:  .LBB4_9: # %udiv-end
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %r14, %r13
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %r9, %r14
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    adcq %rbx, %rdx
+; X64-NEXT:    imulq %r8, %r13
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    imulq %rdi, %r8
+; X64-NEXT:    imulq %r15, %rsi
+; X64-NEXT:    addq %r8, %rsi
+; X64-NEXT:    addq %r13, %rsi
+; X64-NEXT:    addq %rdx, %rsi
+; X64-NEXT:    subq %rcx, %r11
+; X64-NEXT:    sbbq %rax, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rsi, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    xorq %r12, %r10
+; X64-NEXT:    xorq %r12, %r11
+; X64-NEXT:    subq %r12, %r11
+; X64-NEXT:    sbbq %r12, %r10
+; X64-NEXT:    sbbq %rax, %rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r10, %rdx
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB4_3
+; X64-NEXT:  .LBB4_10:
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    jmp .LBB4_8
   %res = srem i129 %a, %b
   ret i129 %res
 }
 
-; Some higher sizes
+; i257 is also expanded inline (wider than MaxDivRemBitWidthSupported=128).
+; The inline expansion uses i256 shifts which are expanded via ExpandToParts.
 define i257 @sdiv257(i257 %a, i257 %b) nounwind {
 ; X86-LABEL: sdiv257:
-; X86-NOT:     call
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $480, %esp # imm = 0x1E0
+; X86-NEXT:    movl 80(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 76(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%ebp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl 64(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 56(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    shldl $31, %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shldl $31, %edx, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB5_3
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB5_3: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $31, %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $31, %edx, %eax
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB5_6
+; X86-NEXT:  .LBB5_4:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB5_6: # %_udiv-special-cases
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB5_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_8: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $31, %edx, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_9
+; X86-NEXT:  # %bb.10: # %_udiv-special-cases
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    jmp .LBB5_11
+; X86-NEXT:  .LBB5_9:
+; X86-NEXT:    bsrl %ecx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:  .LBB5_11: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shldl $31, %ebx, %edx
+; X86-NEXT:    shldl $31, %edi, %ebx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB5_12
+; X86-NEXT:  # %bb.13: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB5_14
+; X86-NEXT:  .LBB5_12:
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB5_14: # %_udiv-special-cases
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB5_16
+; X86-NEXT:  # %bb.15: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:  .LBB5_16: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    jne .LBB5_18
+; X86-NEXT:  # %bb.17: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_18: # %_udiv-special-cases
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    orb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shll $31, %ecx
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB5_20
+; X86-NEXT:  # %bb.19: # %_udiv-special-cases
+; X86-NEXT:    movl $64, %eax
+; X86-NEXT:  .LBB5_20: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    je .LBB5_21
+; X86-NEXT:  # %bb.22: # %_udiv-special-cases
+; X86-NEXT:    je .LBB5_23
+; X86-NEXT:  .LBB5_24: # %_udiv-special-cases
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB5_26
+; X86-NEXT:  .LBB5_25: # %_udiv-special-cases
+; X86-NEXT:    addl $256, %eax # imm = 0x100
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_26: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $31, %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $31, %eax, %edi
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_27
+; X86-NEXT:  # %bb.28: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    jmp .LBB5_29
+; X86-NEXT:  .LBB5_21: # %_udiv-special-cases
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    jne .LBB5_24
+; X86-NEXT:  .LBB5_23: # %_udiv-special-cases
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    je .LBB5_25
+; X86-NEXT:    jmp .LBB5_26
+; X86-NEXT:  .LBB5_27:
+; X86-NEXT:    bsrl %edx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:  .LBB5_29: # %_udiv-special-cases
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $31, %ecx, %eax
+; X86-NEXT:    shldl $31, %ebx, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_30
+; X86-NEXT:  # %bb.31: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB5_32
+; X86-NEXT:  .LBB5_30:
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB5_32: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    jne .LBB5_34
+; X86-NEXT:  # %bb.33: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_34: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $31, %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $31, %edi, %ecx
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_35
+; X86-NEXT:  # %bb.36: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ecx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB5_37
+; X86-NEXT:  .LBB5_35:
+; X86-NEXT:    bsrl %ebx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB5_37: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $31, %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $31, %eax, %esi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_38
+; X86-NEXT:  # %bb.39: # %_udiv-special-cases
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB5_40
+; X86-NEXT:  .LBB5_38:
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB5_40: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    jne .LBB5_42
+; X86-NEXT:  # %bb.41: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:  .LBB5_42: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jne .LBB5_44
+; X86-NEXT:  # %bb.43: # %_udiv-special-cases
+; X86-NEXT:    orl $128, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:  .LBB5_44: # %_udiv-special-cases
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    jne .LBB5_45
+; X86-NEXT:  # %bb.46: # %_udiv-special-cases
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $256, %ecx # imm = 0x100
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    je .LBB5_48
+; X86-NEXT:    jmp .LBB5_49
+; X86-NEXT:  .LBB5_45:
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    jne .LBB5_49
+; X86-NEXT:  .LBB5_48: # %_udiv-special-cases
+; X86-NEXT:    addl $256, %ecx # imm = 0x100
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:  .LBB5_49: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB5_50
+; X86-NEXT:  # %bb.51: # %select.false.sink
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:  .LBB5_52: # %select.end
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB5_54
+; X86-NEXT:  # %bb.53: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB5_54: # %select.end
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    jne .LBB5_55
+; X86-NEXT:  # %bb.61: # %select.end
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    xorl $256, %edi # imm = 0x100
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    je .LBB5_62
+; X86-NEXT:  # %bb.59: # %udiv-bb1
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    addl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $256, %edx # imm = 0x100
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    andl $60, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebx), %esi
+; X86-NEXT:    movl 28(%ebx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl 20(%ebx), %eax
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ebx), %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebx), %eax
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ebx), %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl 404(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%ebx), %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebx), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    je .LBB5_60
+; X86-NEXT:  # %bb.56: # %udiv-preheader
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $60, %eax
+; X86-NEXT:    movl 240(%esp,%eax), %esi
+; X86-NEXT:    movl 236(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 232(%esp,%eax), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 228(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 224(%esp,%eax), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 220(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 216(%esp,%eax), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 208(%esp,%eax), %edi
+; X86-NEXT:    movl 212(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $1, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB5_57: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    leal (%esi,%eax,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $1, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jne .LBB5_57
+; X86-NEXT:  .LBB5_58: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    leal (%edi,%esi,2), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl $31, %edx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:  .LBB5_62: # %udiv-end
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    movb %dl, 32(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB5_50:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jmp .LBB5_52
+; X86-NEXT:  .LBB5_60:
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB5_58
+; X86-NEXT:  .LBB5_55:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jmp .LBB5_62
 ;
 ; X64-LABEL: sdiv257:
-; X64-NOT:     call
+; X64:       # %bb.0: # %_udiv-special-cases
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $480, %rsp # imm = 0x1E0
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 48(%rbp), %rax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    movl %r9d, %r12d
+; X64-NEXT:    andl $1, %r12d
+; X64-NEXT:    negq %r12
+; X64-NEXT:    xorq %r12, %r9
+; X64-NEXT:    xorq %r12, %r8
+; X64-NEXT:    xorq %r12, %r15
+; X64-NEXT:    xorq %r12, %r13
+; X64-NEXT:    xorq %r12, %rsi
+; X64-NEXT:    subq %r12, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    sbbq %r12, %r13
+; X64-NEXT:    sbbq %r12, %r15
+; X64-NEXT:    sbbq %r12, %r8
+; X64-NEXT:    sbbq %r12, %r9
+; X64-NEXT:    movl %r9d, %edi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    xorq %rcx, %rax
+; X64-NEXT:    movq 40(%rbp), %r14
+; X64-NEXT:    xorq %rcx, %r14
+; X64-NEXT:    movq 32(%rbp), %r11
+; X64-NEXT:    xorq %rcx, %r11
+; X64-NEXT:    movq 24(%rbp), %r10
+; X64-NEXT:    xorq %rcx, %r10
+; X64-NEXT:    movq 16(%rbp), %rbx
+; X64-NEXT:    xorq %rcx, %rbx
+; X64-NEXT:    subq %rcx, %rbx
+; X64-NEXT:    sbbq %rcx, %r10
+; X64-NEXT:    sbbq %rcx, %r11
+; X64-NEXT:    sbbq %rcx, %r14
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    xorq %rcx, %r12
+; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    movl %r12d, %ecx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r10, %rcx
+; X64-NEXT:    orq %r14, %rcx
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    orq %r11, %rdx
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %rsi, %rdx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    sete %dl
+; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    orq %r8, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    orq %r15, %rsi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %rdi, %rsi
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    sete %cl
+; X64-NEXT:    orb %dl, %cl
+; X64-NEXT:    shldq $63, %r14, %rax
+; X64-NEXT:    bsrq %rax, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    shldq $63, %r11, %rdx
+; X64-NEXT:    bsrq %rdx, %rsi
+; X64-NEXT:    xorq $63, %rsi
+; X64-NEXT:    orq $64, %rsi
+; X64-NEXT:    testq %rax, %rax
+; X64-NEXT:    cmovneq %rdi, %rsi
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r11, %rdi
+; X64-NEXT:    shldq $63, %r10, %rdi
+; X64-NEXT:    bsrq %rdi, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shldq $63, %rbx, %r10
+; X64-NEXT:    bsrq %r10, %r14
+; X64-NEXT:    xorq $63, %r14
+; X64-NEXT:    orq $64, %r14
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    cmovneq %r11, %r14
+; X64-NEXT:    orq $128, %r14
+; X64-NEXT:    orq %rdx, %r10
+; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    cmovneq %rsi, %r14
+; X64-NEXT:    orq %rax, %rdi
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    shlq $63, %rax
+; X64-NEXT:    bsrq %rax, %rsi
+; X64-NEXT:    xorq $63, %rsi
+; X64-NEXT:    testq %rax, %rax
+; X64-NEXT:    movl $128, %edx
+; X64-NEXT:    cmoveq %rdx, %rsi
+; X64-NEXT:    movl $256, %eax # imm = 0x100
+; X64-NEXT:    cmoveq %rax, %rsi
+; X64-NEXT:    addq $256, %rsi # imm = 0x100
+; X64-NEXT:    orq %r10, %rdi
+; X64-NEXT:    cmovneq %r14, %rsi
+; X64-NEXT:    shldq $63, %r8, %r9
+; X64-NEXT:    bsrq %r9, %rdi
+; X64-NEXT:    xorq $63, %rdi
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    shldq $63, %r15, %r11
+; X64-NEXT:    bsrq %r11, %rbx
+; X64-NEXT:    xorq $63, %rbx
+; X64-NEXT:    orq $64, %rbx
+; X64-NEXT:    testq %r9, %r9
+; X64-NEXT:    cmovneq %rdi, %rbx
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    shldq $63, %r13, %rdi
+; X64-NEXT:    bsrq %rdi, %r14
+; X64-NEXT:    xorq $63, %r14
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    shldq $63, %r15, %r13
+; X64-NEXT:    bsrq %r13, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    orq $64, %r8
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    cmovneq %r14, %r8
+; X64-NEXT:    orq $128, %r8
+; X64-NEXT:    orq %r11, %r13
+; X64-NEXT:    orq %r9, %r11
+; X64-NEXT:    cmovneq %rbx, %r8
+; X64-NEXT:    movq %r15, %rbx
+; X64-NEXT:    orq %r9, %rdi
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    shlq $63, %r9
+; X64-NEXT:    bsrq %r9, %r11
+; X64-NEXT:    xorq $63, %r11
+; X64-NEXT:    testq %r9, %r9
+; X64-NEXT:    cmoveq %rdx, %r11
+; X64-NEXT:    cmoveq %rax, %r11
+; X64-NEXT:    addq $256, %r11 # imm = 0x100
+; X64-NEXT:    orq %r13, %rdi
+; X64-NEXT:    cmovneq %r8, %r11
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    subq %r11, %rsi
+; X64-NEXT:    movl $0, %r10d
+; X64-NEXT:    sbbq %r10, %r10
+; X64-NEXT:    movl $0, %r15d
+; X64-NEXT:    sbbq %r15, %r15
+; X64-NEXT:    movl $0, %r13d
+; X64-NEXT:    sbbq %r13, %r13
+; X64-NEXT:    sbbq %r9, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    jne .LBB5_1
+; X64-NEXT:  # %bb.2: # %select.false.sink
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    cmpq %rsi, %rax
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    sbbq %r10, %rax
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    sbbq %r15, %rax
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    sbbq %r13, %rax
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    sbbq %r9, %rax
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    sbbq %rcx, %rcx
+; X64-NEXT:    setb %al
+; X64-NEXT:  .LBB5_3: # %select.end
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    cmovneq %rdi, %r11
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    cmovneq %rdi, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    cmovneq %rdi, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    cmovneq %rdi, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    cmoveq %rbx, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    jne .LBB5_4
+; X64-NEXT:  # %bb.10: # %select.end
+; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    orq %r13, %rdi
+; X64-NEXT:    movq %rsi, %r8
+; X64-NEXT:    xorq $256, %r8 # imm = 0x100
+; X64-NEXT:    orq %r15, %r8
+; X64-NEXT:    orq %r9, %r8
+; X64-NEXT:    orq %rdi, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    je .LBB5_11
+; X64-NEXT:  # %bb.8: # %udiv-bb1
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %r14
+; X64-NEXT:    addq $1, %r14
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    andl $1, %r9d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $256, %eax # imm = 0x100
+; X64-NEXT:    subl %esi, %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    andl $56, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    movq 400(%rsp,%rax), %rdi
+; X64-NEXT:    movq 408(%rsp,%rax), %rbx
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    shldq %cl, %rdi, %rdx
+; X64-NEXT:    movq 384(%rsp,%rax), %r11
+; X64-NEXT:    movq 392(%rsp,%rax), %r13
+; X64-NEXT:    shldq %cl, %r13, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 416(%rsp,%rax), %rdi
+; X64-NEXT:    shldq %cl, %rbx, %rdi
+; X64-NEXT:    shldq %cl, %r11, %r13
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlq %cl, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %rsi, %r10
+; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r15, %rcx
+; X64-NEXT:    orq %r9, %rcx
+; X64-NEXT:    orq %r10, %rcx
+; X64-NEXT:    je .LBB5_9
+; X64-NEXT:  # %bb.5: # %udiv-preheader
+; X64-NEXT:    movq %r9, %rsi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r14d, %ecx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movl %r14d, %r10d
+; X64-NEXT:    shrl $3, %r10d
+; X64-NEXT:    andl $56, %r10d
+; X64-NEXT:    movq 224(%rsp,%r10), %r8
+; X64-NEXT:    movq 216(%rsp,%r10), %r9
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    shrdq %cl, %r8, %rax
+; X64-NEXT:    movq 208(%rsp,%r10), %r8
+; X64-NEXT:    movq %r8, %r14
+; X64-NEXT:    shrdq %cl, %r9, %r14
+; X64-NEXT:    movq 192(%rsp,%r10), %r15
+; X64-NEXT:    movq 200(%rsp,%r10), %r9
+; X64-NEXT:    movq %r9, %r12
+; X64-NEXT:    shrdq %cl, %r8, %r12
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdq %cl, %r9, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    addq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq $1, %rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    xorl %ebx, %ebx
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB5_6: # %udiv-do-while
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    shldq $1, %r14, %r11
+; X64-NEXT:    shldq $1, %r12, %r14
+; X64-NEXT:    shldq $1, %r15, %r12
+; X64-NEXT:    shrq $63, %rax
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leaq (%rdi,%r15,2), %r15
+; X64-NEXT:    shldq $1, %rcx, %r8
+; X64-NEXT:    orq %r9, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    shldq $1, %r13, %rcx
+; X64-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    shldq $1, %r10, %r13
+; X64-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    shrq $63, %rdx
+; X64-NEXT:    addq %r10, %r10
+; X64-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT:    orl %edx, %ebx
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    cmpq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    sbbq %r12, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    sbbq %r14, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    sbbq %r11, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rdx
+; X64-NEXT:    andl $1, %edx
+; X64-NEXT:    negq %rdx
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    subq %rdx, %r15
+; X64-NEXT:    sbbq %r9, %r12
+; X64-NEXT:    sbbq %r8, %r14
+; X64-NEXT:    sbbq %rax, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    addq $-1, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    adcq $-1, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    adcq $-1, %r9
+; X64-NEXT:    adcq $1, %rsi
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r9, %rax
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    orq %r8, %rdx
+; X64-NEXT:    orq %rsi, %rdx
+; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movl $0, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movl $0, %r9d
+; X64-NEXT:    movl $0, %ebx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    jne .LBB5_6
+; X64-NEXT:  .LBB5_7: # %udiv-loop-exit
+; X64-NEXT:    movq %r8, %r14
+; X64-NEXT:    shldq $1, %rcx, %r14
+; X64-NEXT:    shldq $1, %r13, %rcx
+; X64-NEXT:    shldq $1, %r10, %r13
+; X64-NEXT:    shrq $63, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    leaq (%rax,%r10,2), %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r13, %r11
+; X64-NEXT:  .LBB5_11: # %udiv-end
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    xorq %rsi, %r8
+; X64-NEXT:    xorq %r12, %r14
+; X64-NEXT:    xorq %r12, %rcx
+; X64-NEXT:    xorq %r12, %r11
+; X64-NEXT:    xorq %r12, %rdx
+; X64-NEXT:    subq %r12, %rdx
+; X64-NEXT:    sbbq %r12, %r11
+; X64-NEXT:    sbbq %r12, %rcx
+; X64-NEXT:    sbbq %r12, %r14
+; X64-NEXT:    sbbq %rsi, %r8
+; X64-NEXT:    movq %rdx, (%rax)
+; X64-NEXT:    movq %r11, 8(%rax)
+; X64-NEXT:    movq %rcx, 16(%rax)
+; X64-NEXT:    movq %r14, 24(%rax)
+; X64-NEXT:    andl $1, %r8d
+; X64-NEXT:    movb %r8b, 32(%rax)
+; X64-NEXT:    leaq -40(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB5_1:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    jmp .LBB5_3
+; X64-NEXT:  .LBB5_9:
+; X64-NEXT:    movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    jmp .LBB5_7
+; X64-NEXT:  .LBB5_4:
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    jmp .LBB5_11
   %res = sdiv i257 %a, %b
   ret i257 %res
 }
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c..0b567dca3b362 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -514,99 +514,117 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-LABEL: test4:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r8, %r11
-; X64-NEXT:    movq %rcx, %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rbx, %r15
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r15, %rbx
-; X64-NEXT:    adcq %r14, %rbp
-; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %r10d
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rbp, %r13
-; X64-NEXT:    adcq %r10, %r12
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $256, %rsp # imm = 0x100
+; X64-NEXT:    movq %r9, %r13
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r15, %r10
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r10, %r15
-; X64-NEXT:    adcq %rbp, %rdx
-; X64-NEXT:    imulq %r9, %r11
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    addq %r13, %r14
-; X64-NEXT:    adcq %r12, %r15
-; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r10, %rbp
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    adcq %r13, %rdx
-; X64-NEXT:    imulq %r10, %rcx
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    addq %r14, %r12
-; X64-NEXT:    adcq %r15, %rax
-; X64-NEXT:    adcq %r11, %rcx
-; X64-NEXT:    imulq %r9, %r8
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %r14
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movabsq $17592186044415, %r12 # imm = 0xFFFFFFFFFFF
+; X64-NEXT:    andq %r12, %r13
+; X64-NEXT:    andq 48(%rbp), %r12
+; X64-NEXT:    movq 16(%rbp), %r9
+; X64-NEXT:    subq $8, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %rcx, %rsi
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq 24(%rbp)
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq 16(%rbp), %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq 24(%rbp)
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq 32(%rbp), %r15
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    movq 40(%rbp), %r14
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    imulq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    imulq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT:    addq %rdx, %rsi
-; X64-NEXT:    addq %r8, %rsi
-; X64-NEXT:    addq %rcx, %rsi
-; X64-NEXT:    movq %rbx, 8(%rdi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, (%rdi)
-; X64-NEXT:    movq %r12, 16(%rdi)
-; X64-NEXT:    movq %rax, 24(%rdi)
-; X64-NEXT:    movl %esi, 32(%rdi)
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq 16(%rbp), %r9
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq 24(%rbp)
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq $0
+; X64-NEXT:    pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT:    callq __multi5 at PLT
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq $0, %rax
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    addq %rax, %rsi
+; X64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movaps %xmm0, (%rbx)
+; X64-NEXT:    movq %rcx, 16(%rbx)
+; X64-NEXT:    movq %rdx, 24(%rbx)
+; X64-NEXT:    movl %esi, 32(%rbx)
 ; X64-NEXT:    shrq $32, %rsi
 ; X64-NEXT:    andl $4095, %esi # imm = 0xFFF
-; X64-NEXT:    movw %si, 36(%rdi)
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movw %si, 36(%rbx)
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 65b602801b365..c179c220fa16a 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2730,7 +2730,11 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -2738,22 +2742,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%r9), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rbx,%rbx), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r8
@@ -2766,7 +2770,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r9,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r10
@@ -2777,11 +2781,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -2789,20 +2799,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -2811,10 +2821,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -2822,28 +2838,28 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, (%rsp,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%rsi,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r9, %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r8, %rax
@@ -2852,10 +2868,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -2863,20 +2885,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %rax, %rax
@@ -2884,24 +2906,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %cl
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%r9), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
@@ -2910,11 +2938,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rbx
@@ -2931,28 +2959,34 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -2962,31 +2996,37 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %cl
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %cl, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, -72(%rsp,%rcx), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, (%rsp,%rcx), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rcx), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rcx), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rcx), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rcx), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rcx), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rcx), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%rcx,%rcx), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r11
@@ -2999,27 +3039,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -3028,21 +3074,27 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %cl
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%r9), %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
@@ -3051,11 +3103,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rbx
@@ -3072,26 +3124,32 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -3101,60 +3159,72 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %al, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, -72(%rsp,%rax), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, (%rsp,%rcx), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rcx), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rcx), %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r8,%r8), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rax), %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rax,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rcx), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r8, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %rax, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -3163,6 +3233,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
@@ -4210,7 +4282,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -4219,22 +4295,22 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%r9,4), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%r9,4), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rbx,%rbx), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r8
@@ -4247,7 +4323,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%r9,4), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r9,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r10
@@ -4258,11 +4334,17 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -4271,20 +4353,20 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax,4), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %rax
@@ -4292,10 +4374,16 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -4304,28 +4392,28 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rsi,4), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rsi,4), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, (%rsp,%rsi,4), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rsi,4), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%rsi,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r9, %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r8, %rax
@@ -4334,10 +4422,16 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -4346,20 +4440,20 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax,4), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %rax, %rax
@@ -4367,25 +4461,31 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $6, %cl
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%r9,4), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
@@ -4394,11 +4494,11 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%r9,4), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%r9,4), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rbx
@@ -4415,29 +4515,35 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $6, %al
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -4446,32 +4552,38 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $6, %cl
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %cl, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, -72(%rsp,%rcx,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, (%rsp,%rcx,4), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rcx,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rcx,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rcx,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rcx,4), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rcx,4), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%rcx,%rcx), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r11
@@ -4484,28 +4596,34 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $6, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -4514,22 +4632,28 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $6, %cl
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%r9,4), %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
@@ -4538,11 +4662,11 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%r9,4), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%r9,4), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rbx
@@ -4559,27 +4683,33 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $6, %al
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -4588,62 +4718,74 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $6, %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %al, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, -72(%rsp,%rax,4), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rax,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rax,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $6, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, (%rsp,%rcx,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rcx,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rcx,4), %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r8,%r8), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rax,4), %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rax,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rcx,4), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r8, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %rax, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $6, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -4652,6 +4794,8 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
@@ -4769,58 +4913,76 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-LABEL: lshr_32bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbp
+; X64-SSE2-NEXT:    movq %rsp, %rbp
+; X64-SSE2-NEXT:    andq $-32, %rsp
+; X64-SSE2-NEXT:    subq $96, %rsp
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-SSE2-NEXT:    movq 16(%rdi), %r8
 ; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-SSE2-NEXT:    andl $3, %esi
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT:    movq -64(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT:    movq -48(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT:    movq (%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT:    movq 8(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT:    movq 24(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT:    movq 16(%rsp,%rsi,8), %rsi
 ; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-SSE2-NEXT:    movq %rax, (%rdx)
 ; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT:    movq %rbp, %rsp
+; X64-SSE2-NEXT:    popq %rbp
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: lshr_32bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    pushq %rbp
+; X64-SSE42-NEXT:    movq %rsp, %rbp
+; X64-SSE42-NEXT:    andq $-32, %rsp
+; X64-SSE42-NEXT:    subq $96, %rsp
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movzbl (%rsi), %eax
 ; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, (%rsp)
 ; X64-SSE42-NEXT:    andl $3, %eax
-; X64-SSE42-NEXT:    movups -72(%rsp,%rax,8), %xmm0
-; X64-SSE42-NEXT:    movups -56(%rsp,%rax,8), %xmm1
+; X64-SSE42-NEXT:    movups (%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rsp,%rax,8), %xmm1
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    movq %rbp, %rsp
+; X64-SSE42-NEXT:    popq %rbp
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: lshr_32bytes_qwordOff:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rbp
+; X64-AVX-NEXT:    movq %rsp, %rbp
+; X64-AVX-NEXT:    andq $-32, %rsp
+; X64-AVX-NEXT:    subq $96, %rsp
 ; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-AVX-NEXT:    andl $3, %eax
-; X64-AVX-NEXT:    vmovups -72(%rsp,%rax,8), %xmm0
-; X64-AVX-NEXT:    vmovups -56(%rsp,%rax,8), %xmm1
+; X64-AVX-NEXT:    vmovups (%rsp,%rax,8), %xmm0
+; X64-AVX-NEXT:    vmovups 16(%rsp,%rax,8), %xmm1
 ; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    movq %rbp, %rsp
+; X64-AVX-NEXT:    popq %rbp
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
 ;
@@ -4938,7 +5100,11 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -4946,17 +5112,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    negb %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movsbq %sil, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -32(%rsp,%r10), %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -24(%rsp,%r10), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%r10), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%r10), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r11
@@ -4967,10 +5133,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r11, %r9
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -8(%rsp,%r10), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%r10), %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -16(%rsp,%r10), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%r10), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
@@ -4988,11 +5154,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -5000,20 +5172,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negb %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movsbq %sil, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %r8, %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -5022,10 +5194,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -5033,26 +5211,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbq %sil, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -40(%rsp,%rdi), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -32(%rsp,%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rdi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rsi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r8, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, -16(%rsp,%rdi), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -24(%rsp,%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, 56(%rsp,%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rdi, %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %rdi, %rdi
@@ -5064,10 +5242,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -5075,20 +5259,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbq %sil, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %r8, %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r8, %rcx
@@ -5096,28 +5280,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %cl
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    negb %cl
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%r8), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%r8), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -5125,8 +5315,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%r8), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -5145,27 +5335,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negb %al
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r9
@@ -5175,32 +5371,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, 56(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %rsi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %rsi, %rsi
@@ -5213,27 +5415,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %al
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -5242,25 +5450,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %cl
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    negb %cl
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rsp,%r8), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rsp,%r8), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
@@ -5268,8 +5482,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 40(%rsp,%r8), %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
@@ -5288,25 +5502,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    negb %al
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r9
@@ -5316,30 +5536,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    negb %sil
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, 56(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 40(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %rsi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrq %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rax, %rsi, %rsi
@@ -5352,25 +5578,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsi, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    negb %al
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -5379,6 +5611,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
@@ -6436,7 +6670,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -6445,18 +6683,18 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $2, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    negb %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movsbq %sil, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -32(%rsp,%r10), %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -24(%rsp,%r10), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%r10), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%r10), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r11
@@ -6467,10 +6705,10 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r11, %r9
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -8(%rsp,%r10), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%r10), %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -16(%rsp,%r10), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%r10), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
@@ -6488,11 +6726,17 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -6501,21 +6745,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $2, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negb %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movsbq %sil, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %r8, %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r8
@@ -6523,10 +6767,16 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -6535,27 +6785,27 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $2, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbq %sil, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -40(%rsp,%rdi), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -32(%rsp,%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rdi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rsi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r8, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, -16(%rsp,%rdi), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -24(%rsp,%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, 56(%rsp,%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rdi, %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %rdi, %rdi
@@ -6567,10 +6817,16 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -6579,21 +6835,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $2, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbq %sil, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %r8, %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r8, %rcx
@@ -6601,30 +6857,36 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $2, %cl
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %cl
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    negb %cl
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%r8), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%r8), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -6632,8 +6894,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%r8), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -6652,29 +6914,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $2, %al
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negb %al
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r9
@@ -6683,34 +6951,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $2, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, 56(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %rsi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %rsi, %rsi
@@ -6723,29 +6997,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $2, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %al
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -6754,27 +7034,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $2, %cl
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %cl
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    negb %cl
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rsp,%r8), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rsp,%r8), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
@@ -6782,8 +7068,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 40(%rsp,%r8), %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
@@ -6802,27 +7088,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $2, %al
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    negb %al
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r9
@@ -6831,32 +7123,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $5, %al
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $2, %sil
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    negb %sil
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, 56(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 40(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %rsi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrq %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rax, %rsi, %rsi
@@ -6869,27 +7167,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsi, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $5, %cl
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $2, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %al
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    negb %al
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -6898,6 +7202,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
@@ -7024,67 +7330,85 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-LABEL: shl_32bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbp
+; X64-SSE2-NEXT:    movq %rsp, %rbp
+; X64-SSE2-NEXT:    andq $-32, %rsp
+; X64-SSE2-NEXT:    subq $96, %rsp
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-SSE2-NEXT:    movq 16(%rdi), %r8
 ; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    shlb $3, %sil
 ; X64-SSE2-NEXT:    andb $24, %sil
 ; X64-SSE2-NEXT:    negb %sil
 ; X64-SSE2-NEXT:    movsbq %sil, %rax
-; X64-SSE2-NEXT:    movq -40(%rsp,%rax), %rcx
-; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %rsi
-; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rdi
-; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %rax
+; X64-SSE2-NEXT:    movq 32(%rsp,%rax), %rcx
+; X64-SSE2-NEXT:    movq 40(%rsp,%rax), %rsi
+; X64-SSE2-NEXT:    movq 56(%rsp,%rax), %rdi
+; X64-SSE2-NEXT:    movq 48(%rsp,%rax), %rax
 ; X64-SSE2-NEXT:    movq %rax, 16(%rdx)
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-SSE2-NEXT:    movq %rcx, (%rdx)
 ; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    movq %rbp, %rsp
+; X64-SSE2-NEXT:    popq %rbp
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: shl_32bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    pushq %rbp
+; X64-SSE42-NEXT:    movq %rsp, %rbp
+; X64-SSE42-NEXT:    andq $-32, %rsp
+; X64-SSE42-NEXT:    subq $96, %rsp
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movzbl (%rsi), %eax
 ; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, (%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    shlb $3, %al
 ; X64-SSE42-NEXT:    andb $24, %al
 ; X64-SSE42-NEXT:    negb %al
 ; X64-SSE42-NEXT:    movsbq %al, %rax
-; X64-SSE42-NEXT:    movups -40(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -24(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movups 32(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT:    movups 48(%rsp,%rax), %xmm1
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    movq %rbp, %rsp
+; X64-SSE42-NEXT:    popq %rbp
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: shl_32bytes_qwordOff:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rbp
+; X64-AVX-NEXT:    movq %rsp, %rbp
+; X64-AVX-NEXT:    andq $-32, %rsp
+; X64-AVX-NEXT:    subq $96, %rsp
 ; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %ymm1, (%rsp)
+; X64-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    shlb $3, %al
 ; X64-AVX-NEXT:    andb $24, %al
 ; X64-AVX-NEXT:    negb %al
 ; X64-AVX-NEXT:    movsbq %al, %rax
-; X64-AVX-NEXT:    vmovups -40(%rsp,%rax), %xmm0
-; X64-AVX-NEXT:    vmovups -24(%rsp,%rax), %xmm1
+; X64-AVX-NEXT:    vmovups 32(%rsp,%rax), %xmm0
+; X64-AVX-NEXT:    vmovups 48(%rsp,%rax), %xmm1
 ; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    movq %rbp, %rsp
+; X64-AVX-NEXT:    popq %rbp
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
 ;
@@ -7211,32 +7535,36 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%r9), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rbx,%rbx), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r8
@@ -7249,7 +7577,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r9,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r10
@@ -7260,34 +7588,40 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -7296,41 +7630,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, (%rsp,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%rsi,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r9, %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r8, %rax
@@ -7339,33 +7679,39 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxq %rcx, %rax, %rax
@@ -7373,28 +7719,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%r9), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
@@ -7403,11 +7755,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rbx
@@ -7424,32 +7776,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -7459,35 +7817,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, (%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%rsi,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r11
@@ -7500,31 +7864,37 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -7533,28 +7903,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%r9), %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
@@ -7563,11 +7939,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rbx
@@ -7584,32 +7960,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -7619,35 +8001,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %sil
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, (%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rsi,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r11
@@ -7660,31 +8048,37 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $24, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rax), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -7693,6 +8087,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
@@ -8870,7 +9266,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -8878,25 +9278,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $5, %al
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%r9,4), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%r9,4), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rbx,%rbx), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r8
@@ -8909,7 +9309,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r10, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%r9,4), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r9,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r10
@@ -8920,11 +9320,17 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -8932,23 +9338,23 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $5, %cl
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax,4), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarq %cl, %rax
@@ -8956,10 +9362,16 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -8967,31 +9379,31 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $5, %al
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rsi,4), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rsi,4), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, (%rsp,%rsi,4), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rsi,4), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%rsi,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rax, %r9, %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r8, %rax
@@ -9000,10 +9412,16 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -9011,23 +9429,23 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $5, %cl
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax,4), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxq %rcx, %rax, %rax
@@ -9035,29 +9453,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $5, %al
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%r9,4), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
@@ -9066,11 +9490,11 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%r9,4), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%r9,4), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rbx
@@ -9087,33 +9511,39 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $5, %cl
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -9122,36 +9552,42 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $5, %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, (%rsp,%rsi,4), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rsi,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rsi,4), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rsi,4), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%rsi,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rdi, %r11
@@ -9164,32 +9600,38 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $5, %cl
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r9
@@ -9198,29 +9640,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
 ; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $5, %al
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%r9,4), %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
@@ -9229,11 +9677,11 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%r9,4), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%r9,4), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r9,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rbx
@@ -9250,33 +9698,39 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $5, %cl
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -9285,36 +9739,42 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
 ; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $5, %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $6, %sil
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, (%rsp,%rsi,4), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rsi,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rsi,4), %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rsi,4), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rsi,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rdi, %r11
@@ -9327,32 +9787,38 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subq $96, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $5, %cl
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $6, %sil
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rax,4), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rax,4), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r9
@@ -9361,6 +9827,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
@@ -9521,71 +9989,89 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-LABEL: ashr_32bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbp
+; X64-SSE2-NEXT:    movq %rsp, %rbp
+; X64-SSE2-NEXT:    andq $-32, %rsp
+; X64-SSE2-NEXT:    subq $96, %rsp
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-SSE2-NEXT:    movq 16(%rdi), %r8
 ; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-SSE2-NEXT:    movzbl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-SSE2-NEXT:    sarq $63, %rdi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    andl $3, %esi
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT:    movq -64(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT:    movq -48(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT:    movq (%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT:    movq 8(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT:    movq 24(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT:    movq 16(%rsp,%rsi,8), %rsi
 ; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-SSE2-NEXT:    movq %rax, (%rdx)
 ; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT:    movq %rbp, %rsp
+; X64-SSE2-NEXT:    popq %rbp
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: ashr_32bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    pushq %rbp
+; X64-SSE42-NEXT:    movq %rsp, %rbp
+; X64-SSE42-NEXT:    andq $-32, %rsp
+; X64-SSE42-NEXT:    subq $96, %rsp
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movq 16(%rdi), %rax
 ; X64-SSE42-NEXT:    movq 24(%rdi), %rcx
 ; X64-SSE42-NEXT:    movzbl (%rsi), %esi
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, (%rsp)
 ; X64-SSE42-NEXT:    sarq $63, %rcx
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    andl $3, %esi
-; X64-SSE42-NEXT:    movups -72(%rsp,%rsi,8), %xmm0
-; X64-SSE42-NEXT:    movups -56(%rsp,%rsi,8), %xmm1
+; X64-SSE42-NEXT:    movups (%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rsp,%rsi,8), %xmm1
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    movq %rbp, %rsp
+; X64-SSE42-NEXT:    popq %rbp
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: ashr_32bytes_qwordOff:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rbp
+; X64-AVX-NEXT:    movq %rsp, %rbp
+; X64-AVX-NEXT:    andq $-32, %rsp
+; X64-AVX-NEXT:    subq $96, %rsp
 ; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-AVX-NEXT:    movq 16(%rdi), %rax
 ; X64-AVX-NEXT:    movq 24(%rdi), %rcx
 ; X64-AVX-NEXT:    movzbl (%rsi), %esi
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %xmm0, (%rsp)
 ; X64-AVX-NEXT:    sarq $63, %rcx
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    andl $3, %esi
-; X64-AVX-NEXT:    vmovups -72(%rsp,%rsi,8), %xmm0
-; X64-AVX-NEXT:    vmovups -56(%rsp,%rsi,8), %xmm1
+; X64-AVX-NEXT:    vmovups (%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT:    vmovups 16(%rsp,%rsi,8), %xmm1
 ; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    movq %rbp, %rsp
+; X64-AVX-NEXT:    popq %rbp
 ; X64-AVX-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: ashr_32bytes_qwordOff:
@@ -9746,11 +10232,15 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -9761,29 +10251,29 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%rsi), %edi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rdi,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rdi), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r9
@@ -9794,11 +10284,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rdi), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -96(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r14,%r14), %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r11
@@ -9809,11 +10299,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %rbx, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -88(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%rdi), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%rdi), %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r13,%r13), %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r15
@@ -9826,7 +10316,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r14, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rdi,%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r14
@@ -9841,18 +10331,24 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, (%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -9863,61 +10359,66 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r10, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rbx, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, %r15
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rbx, %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, 48(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 56(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r15, 40(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -9928,36 +10429,36 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%rsi), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rax,8), %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -120(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r8, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %cl
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r10,%r10), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, (%rsp,%rax), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r14,%r14), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r9, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %r9
@@ -9965,9 +10466,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r10, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -88(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r10, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -80(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r15,%r15), %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r12, %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %r12
@@ -9976,7 +10477,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%rax,%rax), %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r14, %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %rcx
@@ -9989,18 +10490,23 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, (%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq -32(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -10011,37 +10517,37 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r10, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rcx, %rax, %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -10054,57 +10560,62 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $192, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%rsi), %r8d
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%r8,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %r8d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -128(%rsp,%r8), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    addq %rdi, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -104(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%r8), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -96(%rsp,%r8), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 64(%rsp,%r8), %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r12,%r12), %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -112(%rsp,%r8), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%r8), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbx, %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r14
@@ -10112,12 +10623,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -88(%rsp,%r8), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 72(%rsp,%r8), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r14, %r13
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 80(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r13, %r15
@@ -10128,13 +10639,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r12, %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 88(%rsp,%r8), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r8,%r8), %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r9, %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    addq %rbx, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -10150,7 +10662,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r13
@@ -10161,42 +10673,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r14
@@ -10210,126 +10726,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r13
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%rsi), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %cl
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, (%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r10, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r14, %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r15, %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r10, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %rax, %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r15, 48(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r12, 40(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r13
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r11, %r10
@@ -10343,51 +10871,56 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r14, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX1:       # %bb.0:
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    subq $192, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%rsi), %r9d
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (,%r9,8), %eax
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -128(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 32(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 40(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    addq %rdi, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 56(%rsp,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r10, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 64(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%r12,%r12), %r11
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 48(%rsp,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rbx, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r14
@@ -10395,12 +10928,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 72(%rsp,%r9), %r14
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r14, %r13
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -80(%rsp,%r9), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 80(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%r8,%r8), %r15
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r13, %r15
@@ -10411,13 +10944,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r12, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 88(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%r9,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r8, %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    addq %rbx, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
@@ -10433,7 +10967,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r13
@@ -10445,36 +10979,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX1:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %rax, %r14
@@ -10488,116 +11026,128 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r13
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -120(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -112(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -104(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -96(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, (%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 16(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 24(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 32(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r10, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -88(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -80(%rsp,%rsi), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rax, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 40(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 48(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r14, %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%rsi,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r15, %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rax, %r9, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r10, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %rsi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rcx, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r8, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rcx, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r15, 48(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r12, 40(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r13
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rcx, %r11, %r10
@@ -10611,49 +11161,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r14, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX512:       # %bb.0:
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    subq $192, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %r9d
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%r9,8), %eax
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -128(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 32(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 40(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    addq %rdi, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 56(%rsp,%r9), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 64(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%r12,%r12), %r11
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 48(%rsp,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r14
@@ -10661,12 +11216,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 72(%rsp,%r9), %r14
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, %r13
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -80(%rsp,%r9), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 80(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%r8,%r8), %r15
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r13, %r15
@@ -10677,13 +11232,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r12, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 88(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%r9,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r8, %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    addq %rbx, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
@@ -10699,7 +11255,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r13
@@ -10711,33 +11267,37 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %edi
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%rdi,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -96(%rsp,%rdi), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -104(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 32(%rsp,%rdi), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 24(%rsp,%rdi), %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -112(%rsp,%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 16(%rsp,%rdi), %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -80(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -88(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 48(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 40(%rsp,%rdi), %r11
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -72(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 56(%rsp,%rdi), %r11
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -128(%rsp,%rdi), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -120(%rsp,%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq (%rsp,%rdi), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 8(%rsp,%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %rdi, %r14
@@ -10751,110 +11311,122 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r13
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -120(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -112(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -104(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -96(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, (%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 16(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 24(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 32(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r10, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -88(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -80(%rsp,%rsi), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 40(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 48(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r14, %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 56(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%rsi,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r15, %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r9, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r10, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %rsi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rcx, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r8, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rcx, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r15, 48(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r12, 40(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r13
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r11, %r10
@@ -10868,9 +11440,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r14, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
 ;
@@ -13378,7 +13952,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-LABEL: lshr_64bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbp
+; X64-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-SSE2-NEXT:    pushq %rbx
+; X64-SSE2-NEXT:    andq $-32, %rsp
+; X64-SSE2-NEXT:    subq $160, %rsp
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -13389,27 +13967,27 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-SSE2-NEXT:    movl (%rsi), %esi
 ; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-SSE2-NEXT:    andl $7, %esi
-; X64-SSE2-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT:    movq -120(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT:    movq -104(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT:    movq -112(%rsp,%rsi,8), %r8
-; X64-SSE2-NEXT:    movq -88(%rsp,%rsi,8), %r9
-; X64-SSE2-NEXT:    movq -96(%rsp,%rsi,8), %r10
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %r11
-; X64-SSE2-NEXT:    movq -80(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT:    movq (%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT:    movq 8(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT:    movq 24(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT:    movq 16(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT:    movq 40(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT:    movq 32(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT:    movq 56(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT:    movq 48(%rsp,%rsi,8), %rsi
 ; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
 ; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
 ; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
@@ -13418,80 +13996,94 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-SSE2-NEXT:    movq %rax, (%rdx)
 ; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-SSE2-NEXT:    popq %rbx
+; X64-SSE2-NEXT:    popq %rbp
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: lshr_64bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    pushq %rax
+; X64-SSE42-NEXT:    pushq %rbp
+; X64-SSE42-NEXT:    movq %rsp, %rbp
+; X64-SSE42-NEXT:    andq $-32, %rsp
+; X64-SSE42-NEXT:    subq $160, %rsp
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
 ; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
 ; X64-SSE42-NEXT:    movl (%rsi), %eax
 ; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, (%rsp)
 ; X64-SSE42-NEXT:    andl $7, %eax
-; X64-SSE42-NEXT:    movups -128(%rsp,%rax,8), %xmm0
-; X64-SSE42-NEXT:    movups -112(%rsp,%rax,8), %xmm1
-; X64-SSE42-NEXT:    movups -96(%rsp,%rax,8), %xmm2
-; X64-SSE42-NEXT:    movups -80(%rsp,%rax,8), %xmm3
+; X64-SSE42-NEXT:    movups (%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rsp,%rax,8), %xmm1
+; X64-SSE42-NEXT:    movups 32(%rsp,%rax,8), %xmm2
+; X64-SSE42-NEXT:    movups 48(%rsp,%rax,8), %xmm3
 ; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    popq %rax
+; X64-SSE42-NEXT:    movq %rbp, %rsp
+; X64-SSE42-NEXT:    popq %rbp
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: lshr_64bytes_qwordOff:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
+; X64-AVX1-NEXT:    pushq %rbp
+; X64-AVX1-NEXT:    movq %rsp, %rbp
+; X64-AVX1-NEXT:    andq $-32, %rsp
+; X64-AVX1-NEXT:    subq $160, %rsp
 ; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-AVX1-NEXT:    movl (%rsi), %eax
 ; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-AVX1-NEXT:    andl $7, %eax
-; X64-AVX1-NEXT:    vmovups -128(%rsp,%rax,8), %xmm0
-; X64-AVX1-NEXT:    vmovups -112(%rsp,%rax,8), %xmm1
-; X64-AVX1-NEXT:    vmovups -96(%rsp,%rax,8), %xmm2
-; X64-AVX1-NEXT:    vmovups -80(%rsp,%rax,8), %xmm3
+; X64-AVX1-NEXT:    vmovups (%rsp,%rax,8), %xmm0
+; X64-AVX1-NEXT:    vmovups 16(%rsp,%rax,8), %xmm1
+; X64-AVX1-NEXT:    vmovups 32(%rsp,%rax,8), %xmm2
+; X64-AVX1-NEXT:    vmovups 48(%rsp,%rax,8), %xmm3
 ; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT:    popq %rax
+; X64-AVX1-NEXT:    movq %rbp, %rsp
+; X64-AVX1-NEXT:    popq %rbp
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
 ; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    pushq %rax
+; X64-AVX512-NEXT:    pushq %rbp
+; X64-AVX512-NEXT:    movq %rsp, %rbp
+; X64-AVX512-NEXT:    andq $-32, %rsp
+; X64-AVX512-NEXT:    subq $160, %rsp
 ; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-AVX512-NEXT:    movl (%rsi), %eax
 ; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT:    vmovups %zmm0, (%rsp)
 ; X64-AVX512-NEXT:    andl $7, %eax
-; X64-AVX512-NEXT:    vmovups -128(%rsp,%rax,8), %xmm0
-; X64-AVX512-NEXT:    vmovups -112(%rsp,%rax,8), %xmm1
-; X64-AVX512-NEXT:    vmovups -96(%rsp,%rax,8), %xmm2
-; X64-AVX512-NEXT:    vmovups -80(%rsp,%rax,8), %xmm3
+; X64-AVX512-NEXT:    vmovups (%rsp,%rax,8), %xmm0
+; X64-AVX512-NEXT:    vmovups 16(%rsp,%rax,8), %xmm1
+; X64-AVX512-NEXT:    vmovups 32(%rsp,%rax,8), %xmm2
+; X64-AVX512-NEXT:    vmovups 48(%rsp,%rax,8), %xmm3
 ; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT:    popq %rax
+; X64-AVX512-NEXT:    movq %rbp, %rsp
+; X64-AVX512-NEXT:    popq %rbp
 ; X64-AVX512-NEXT:    vzeroupper
 ; X64-AVX512-NEXT:    retq
 ;
@@ -13716,11 +14308,15 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -13731,25 +14327,25 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%rsi), %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    negl %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movslq %esi, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%rbx), %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%rbx), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 64(%rsp,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 72(%rsp,%rbx), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r10
@@ -13760,11 +14356,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r10, %r9
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%rbx), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 88(%rsp,%rbx), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%rbx), %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 80(%rsp,%rbx), %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r15, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
@@ -13776,11 +14372,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r15, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -24(%rsp,%rbx), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 104(%rsp,%rbx), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -32(%rsp,%rbx), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 96(%rsp,%rbx), %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r13, %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
@@ -13792,10 +14388,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r13, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -8(%rsp,%rbx), %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 120(%rsp,%rbx), %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -16(%rsp,%rbx), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 112(%rsp,%rbx), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
@@ -13817,18 +14413,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $128, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -13839,37 +14440,37 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%rsi), %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negl %esi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movslq %esi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -48(%rsp,%r9), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -40(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 80(%rsp,%r9), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 88(%rsp,%r9), %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -64(%rsp,%r9), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 64(%rsp,%r9), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 72(%rsp,%r9), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -32(%rsp,%r9), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -24(%rsp,%r9), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 96(%rsp,%r9), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 104(%rsp,%r9), %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %r11, %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %r10, %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -16(%rsp,%r9), %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -8(%rsp,%r9), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 112(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 120(%rsp,%r9), %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %r10, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %rbx, %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldq %cl, %r8, %rdi
@@ -13883,18 +14484,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addq $8, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leaq -16(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -13905,35 +14510,35 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negl %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 64(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 72(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rdi, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r9, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %r9, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r8, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -40(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 88(%rsp,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 80(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r8, %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %r8, %r8
@@ -13941,9 +14546,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r14, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -24(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 104(%rsp,%rsi), %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rbx, %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -32(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 96(%rsp,%rsi), %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r15, %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %r15, %r15
@@ -13951,8 +14556,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r12, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -16(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, 120(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 112(%rsp,%rsi), %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rsi, %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrq %rsi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rax, %rsi, %rsi
@@ -13968,18 +14573,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq -32(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $128, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -13990,37 +14599,37 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%rsi), %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negl %esi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movslq %esi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -48(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -40(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 80(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 88(%rsp,%r8), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -64(%rsp,%r8), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -56(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 64(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 72(%rsp,%r8), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -32(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -24(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 96(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 104(%rsp,%r8), %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %r11, %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %r9, %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -16(%rsp,%r8), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 112(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %r9, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %rbx, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldq %cl, %r10, %rdi
@@ -14033,44 +14642,49 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addq $8, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq -16(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    negl %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 104(%rsp,%r9), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 96(%rsp,%r9), %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -14078,7 +14692,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r10, %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 88(%rsp,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbx, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -14086,7 +14700,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r11, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 80(%rsp,%r9), %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r15, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -14094,8 +14708,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %rbx, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 72(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r12, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -14108,7 +14722,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 112(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r12, %r13
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r13
@@ -14116,7 +14730,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 120(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %r12
@@ -14133,52 +14747,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negl %eax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 104(%rsp,%r8), %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 80(%rsp,%r8), %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 72(%rsp,%r8), %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 112(%rsp,%r8), %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldq %cl, %r14, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r9
@@ -14192,58 +14812,63 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, 32(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 40(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negl %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 104(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %rdi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 96(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r8, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 88(%rsp,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r9, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %r9, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 80(%rsp,%rsi), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r10, %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 72(%rsp,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r11, %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %r11, %r11
@@ -14252,12 +14877,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 112(%rsp,%rsi), %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r15, %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, 120(%rsp,%rsi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rax, %r15, %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rcx, %rax
@@ -14269,52 +14894,57 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, 32(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq -32(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%rdi), %xmm3
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negl %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 104(%rsp,%r8), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 80(%rsp,%r8), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 72(%rsp,%r8), %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 112(%rsp,%r8), %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldq %cl, %r14, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r11, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -14327,38 +14957,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 32(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 40(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX1:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    negl %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 104(%rsp,%r9), %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 96(%rsp,%r9), %r11
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %r8
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
@@ -14366,7 +15002,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r10, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 88(%rsp,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rbx, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
@@ -14374,7 +15010,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r11, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 80(%rsp,%r9), %r15
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r15, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
@@ -14382,8 +15018,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %rbx, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 72(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r12, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
@@ -14396,7 +15032,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 112(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r12, %r13
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r13
@@ -14404,7 +15040,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 120(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %r12
@@ -14421,47 +15057,53 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r13
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r14
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX1:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm2, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    negl %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 104(%rsp,%r8), %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 80(%rsp,%r8), %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 72(%rsp,%r8), %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 112(%rsp,%r8), %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldq %cl, %r14, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r9
@@ -14475,53 +15117,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rax, 32(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rsi, 40(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    negl %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 104(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %rdi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 96(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r8, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 88(%rsp,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r9, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrq %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r9, %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 80(%rsp,%rsi), %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r10, %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrq %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 72(%rsp,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r11, %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrq %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r11, %r11
@@ -14530,12 +15177,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 112(%rsp,%rsi), %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r15, %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrq %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, 120(%rsp,%rsi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrq %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rax, %r15, %rax
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %rcx, %rax
@@ -14547,47 +15194,52 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r10, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r9, 32(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq -32(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm2, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    negl %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 104(%rsp,%r8), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 80(%rsp,%r8), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 72(%rsp,%r8), %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 112(%rsp,%r8), %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldq %cl, %r14, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r11, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -14600,36 +15252,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rax, 32(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rsi, 40(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX512:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%rcx,8), %eax
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    negl %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 104(%rsp,%r9), %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 96(%rsp,%r9), %r11
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r8
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
@@ -14637,7 +15295,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r10, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 88(%rsp,%r9), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
@@ -14645,7 +15303,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r11, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 80(%rsp,%r9), %r15
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r15, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r11
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
@@ -14653,8 +15311,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %rbx, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 72(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r12, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
@@ -14667,7 +15325,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 112(%rsp,%r9), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r12, %r13
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r13
@@ -14675,7 +15333,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq 120(%rsp,%r9), %r9
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r12
@@ -14692,44 +15350,50 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r13
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r14
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    negl %eax
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 104(%rsp,%r8), %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 80(%rsp,%r8), %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 72(%rsp,%r8), %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 112(%rsp,%r8), %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %r14, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r9
@@ -14743,50 +15407,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rax, 32(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rsi, 40(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    negl %esi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 104(%rsp,%rsi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %rdi, %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 96(%rsp,%rsi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r8, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 88(%rsp,%rsi), %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r9, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r9, %r9
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 80(%rsp,%rsi), %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r10, %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 72(%rsp,%rsi), %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r11, %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r11, %r11
@@ -14795,12 +15464,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 112(%rsp,%rsi), %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r15, %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, 120(%rsp,%rsi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r15, %rax
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %rcx, %rax
@@ -14812,44 +15481,49 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r10, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, 32(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq -32(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    negl %eax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 104(%rsp,%r8), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 80(%rsp,%r8), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 72(%rsp,%r8), %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 112(%rsp,%r8), %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %r14, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r11, %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -14862,9 +15536,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, 32(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsi, 40(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
 ;
@@ -17493,7 +18169,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-LABEL: shl_64bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbp
+; X64-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-SSE2-NEXT:    pushq %rbx
+; X64-SSE2-NEXT:    andq $-32, %rsp
+; X64-SSE2-NEXT:    subq $160, %rsp
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -17504,30 +18184,30 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-SSE2-NEXT:    movl (%rsi), %esi
 ; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, (%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    shll $3, %esi
 ; X64-SSE2-NEXT:    andl $56, %esi
 ; X64-SSE2-NEXT:    negl %esi
 ; X64-SSE2-NEXT:    movslq %esi, %rax
-; X64-SSE2-NEXT:    movq -64(%rsp,%rax), %rcx
-; X64-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-SSE2-NEXT:    movq -40(%rsp,%rax), %rdi
-; X64-SSE2-NEXT:    movq -48(%rsp,%rax), %r8
-; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %r9
-; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %r10
-; X64-SSE2-NEXT:    movq -8(%rsp,%rax), %r11
-; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rax
+; X64-SSE2-NEXT:    movq 64(%rsp,%rax), %rcx
+; X64-SSE2-NEXT:    movq 72(%rsp,%rax), %rsi
+; X64-SSE2-NEXT:    movq 88(%rsp,%rax), %rdi
+; X64-SSE2-NEXT:    movq 80(%rsp,%rax), %r8
+; X64-SSE2-NEXT:    movq 104(%rsp,%rax), %r9
+; X64-SSE2-NEXT:    movq 96(%rsp,%rax), %r10
+; X64-SSE2-NEXT:    movq 120(%rsp,%rax), %r11
+; X64-SSE2-NEXT:    movq 112(%rsp,%rax), %rax
 ; X64-SSE2-NEXT:    movq %rax, 48(%rdx)
 ; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
 ; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
@@ -17536,89 +18216,103 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-SSE2-NEXT:    movq %rcx, (%rdx)
 ; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    leaq -8(%rbp), %rsp
 ; X64-SSE2-NEXT:    popq %rbx
+; X64-SSE2-NEXT:    popq %rbp
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: shl_64bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    pushq %rax
+; X64-SSE42-NEXT:    pushq %rbp
+; X64-SSE42-NEXT:    movq %rsp, %rbp
+; X64-SSE42-NEXT:    andq $-32, %rsp
+; X64-SSE42-NEXT:    subq $160, %rsp
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
 ; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
 ; X64-SSE42-NEXT:    movl (%rsi), %eax
 ; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, (%rsp)
+; X64-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    shll $3, %eax
 ; X64-SSE42-NEXT:    andl $56, %eax
 ; X64-SSE42-NEXT:    negl %eax
 ; X64-SSE42-NEXT:    cltq
-; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT:    movups -32(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT:    movups -16(%rsp,%rax), %xmm3
+; X64-SSE42-NEXT:    movups 64(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT:    movups 80(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movups 96(%rsp,%rax), %xmm2
+; X64-SSE42-NEXT:    movups 112(%rsp,%rax), %xmm3
 ; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    popq %rax
+; X64-SSE42-NEXT:    movq %rbp, %rsp
+; X64-SSE42-NEXT:    popq %rbp
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: shl_64bytes_qwordOff:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
+; X64-AVX1-NEXT:    pushq %rbp
+; X64-AVX1-NEXT:    movq %rsp, %rbp
+; X64-AVX1-NEXT:    andq $-32, %rsp
+; X64-AVX1-NEXT:    subq $160, %rsp
 ; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-AVX1-NEXT:    movl (%rsi), %eax
 ; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm2, (%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-AVX1-NEXT:    shll $3, %eax
 ; X64-AVX1-NEXT:    andl $56, %eax
 ; X64-AVX1-NEXT:    negl %eax
 ; X64-AVX1-NEXT:    cltq
-; X64-AVX1-NEXT:    vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT:    vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT:    vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT:    vmovups -16(%rsp,%rax), %xmm3
+; X64-AVX1-NEXT:    vmovups 64(%rsp,%rax), %xmm0
+; X64-AVX1-NEXT:    vmovups 80(%rsp,%rax), %xmm1
+; X64-AVX1-NEXT:    vmovups 96(%rsp,%rax), %xmm2
+; X64-AVX1-NEXT:    vmovups 112(%rsp,%rax), %xmm3
 ; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT:    popq %rax
+; X64-AVX1-NEXT:    movq %rbp, %rsp
+; X64-AVX1-NEXT:    popq %rbp
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX512-LABEL: shl_64bytes_qwordOff:
 ; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    pushq %rax
+; X64-AVX512-NEXT:    pushq %rbp
+; X64-AVX512-NEXT:    movq %rsp, %rbp
+; X64-AVX512-NEXT:    andq $-32, %rsp
+; X64-AVX512-NEXT:    subq $160, %rsp
 ; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-AVX512-NEXT:    movl (%rsi), %eax
 ; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT:    vmovups %zmm1, (%rsp)
+; X64-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; X64-AVX512-NEXT:    shll $3, %eax
 ; X64-AVX512-NEXT:    andl $56, %eax
 ; X64-AVX512-NEXT:    negl %eax
 ; X64-AVX512-NEXT:    cltq
-; X64-AVX512-NEXT:    vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT:    vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT:    vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT:    vmovups -16(%rsp,%rax), %xmm3
+; X64-AVX512-NEXT:    vmovups 64(%rsp,%rax), %xmm0
+; X64-AVX512-NEXT:    vmovups 80(%rsp,%rax), %xmm1
+; X64-AVX512-NEXT:    vmovups 96(%rsp,%rax), %xmm2
+; X64-AVX512-NEXT:    vmovups 112(%rsp,%rax), %xmm3
 ; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT:    popq %rax
+; X64-AVX512-NEXT:    movq %rbp, %rsp
+; X64-AVX512-NEXT:    popq %rbp
 ; X64-AVX512-NEXT:    vzeroupper
 ; X64-AVX512-NEXT:    retq
 ;
@@ -17859,11 +18553,15 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r8
@@ -17873,34 +18571,34 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rdi), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, (%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarq $63, %r14
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rdi,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rdi), %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rdi), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r9
@@ -17911,11 +18609,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r8
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rdi), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -96(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r14,%r14), %r11
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r11
@@ -17926,11 +18624,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %rbx, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -88(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%rdi), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%rdi), %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%r13,%r13), %r15
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r15
@@ -17943,7 +18641,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    orq %r14, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq (%rdi,%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlq %cl, %r14
@@ -17958,18 +18656,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, (%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -17979,66 +18683,71 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r10, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rbx, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r14, %r15
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rbx, %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarq %cl, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, 48(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rax, 56(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r15, 40(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r9, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -18048,41 +18757,41 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rax,8), %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %esi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -120(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r8, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %cl
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r10,%r10), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %rdi, %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, (%rsp,%rax), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r8, %r8
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r14,%r14), %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r9, %r9
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %r9
@@ -18090,9 +18799,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %r10, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -88(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r10, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -80(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%r15,%r15), %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r12, %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %r12
@@ -18101,7 +18810,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r10, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %r10
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxq %rsi, %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq (%rax,%rax), %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxq %rcx, %r14, %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orq %rbx, %rcx
@@ -18114,18 +18823,23 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, (%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq -32(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rdi), %r9
@@ -18135,42 +18849,42 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 8(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r9, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 24(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r10, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 48(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r14, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r11, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdq %cl, %rax, %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxq %rcx, %rax, %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -18183,63 +18897,68 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
 ; X64-NO-SHLD-NO-BMI2-SSE4:       # %bb.0:
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    subq $192, %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rdi), %rax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarq $63, %rcx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rdi,8), %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    addq %r8, %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r8
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%rdi), %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 64(%rsp,%rdi), %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r12,%r12), %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%rdi), %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %rbx, %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r14
@@ -18247,12 +18966,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 72(%rsp,%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r14, %r13
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -80(%rsp,%rdi), %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 80(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%r9,%r9), %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r13, %r15
@@ -18263,13 +18982,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r12, %r14
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq 88(%rsp,%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq (%rdi,%rdi), %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    orq %r9, %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    addq %rbx, %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ecx
@@ -18285,7 +19005,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, (%rdx)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-SSE4-NEXT:    popq %r13
@@ -18296,48 +19016,52 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rdi), %rcx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdq %cl, %rax, %r14
@@ -18351,138 +19075,150 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r13
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %cl
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, (%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r10, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r14, %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r15, %r15
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxq %rsi, %r10, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxq %rcx, %rax, %rax
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r15, 48(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r12, 40(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r13
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%rdi), %xmm2
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rdi), %rcx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxq %rcx, %r11, %r10
@@ -18496,61 +19232,66 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r14, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
 ; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
 ; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r15
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r14
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r13
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    subq $192, %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rdi), %rax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rdi), %rcx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rdi,8), %eax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %eax
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 32(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 40(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    addq %r8, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r8
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rsp,%rdi), %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 64(%rsp,%rdi), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r12,%r12), %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r11
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rsp,%rdi), %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r14
@@ -18558,12 +19299,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r10
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 72(%rsp,%rdi), %r14
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r14, %r13
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -80(%rsp,%rdi), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 80(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r9,%r9), %r15
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r15
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r13, %r15
@@ -18574,13 +19315,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r12, %r14
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 88(%rsp,%rdi), %rdi
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%rdi,%rdi), %r12
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r9, %r12
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r9
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    addq %rbx, %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
@@ -18596,7 +19338,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %r12
 ; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %r13
@@ -18608,46 +19350,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
 ; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rdi), %rcx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r14
@@ -18661,136 +19407,148 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %r14
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
 ;
 ; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
 ; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r13
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subq $160, %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rdi), %rcx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, %ecx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, (%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 32(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rsi, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %r10, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rsi, %r11, %r11
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 40(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rsi, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %r14, %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rsi, %rbx, %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rsi, %r15, %r15
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rsi, %r10, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarxq %rcx, %rax, %rax
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsi, 8(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r15, 48(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r12, 40(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbx
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r12
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r13
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r14
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subq $160, %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rdi), %rcx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %ecx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 24(%rsp,%rax), %r9
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, %rsi
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 16(%rsp,%rax), %r10
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, %r8
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 40(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r9, %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rsp,%rax), %r11
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 8(%rsp,%rax), %rax
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r10, %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarxq %rcx, %r11, %r10
@@ -18804,9 +19562,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r14, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbx
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r14
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbp
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
 ;
@@ -20977,119 +21737,135 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-LABEL: ashr_64bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbp
+; X64-SSE2-NEXT:    movq %rsp, %rbp
+; X64-SSE2-NEXT:    pushq %r14
 ; X64-SSE2-NEXT:    pushq %rbx
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %r9
-; X64-SSE2-NEXT:    movq 32(%rdi), %r10
-; X64-SSE2-NEXT:    movq 40(%rdi), %r11
-; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
+; X64-SSE2-NEXT:    andq $-32, %rsp
+; X64-SSE2-NEXT:    subq $128, %rsp
+; X64-SSE2-NEXT:    movq (%rdi), %rcx
+; X64-SSE2-NEXT:    movq 8(%rdi), %r8
+; X64-SSE2-NEXT:    movq 16(%rdi), %r9
+; X64-SSE2-NEXT:    movq 24(%rdi), %r10
+; X64-SSE2-NEXT:    movq 32(%rdi), %r11
+; X64-SSE2-NEXT:    movq 40(%rdi), %rbx
+; X64-SSE2-NEXT:    movq 48(%rdi), %r14
 ; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
-; X64-SSE2-NEXT:    movl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movl (%rsi), %eax
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, (%rsp)
 ; X64-SSE2-NEXT:    sarq $63, %rdi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $7, %esi
-; X64-SSE2-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT:    movq -120(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT:    movq -104(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT:    movq -112(%rsp,%rsi,8), %r8
-; X64-SSE2-NEXT:    movq -88(%rsp,%rsi,8), %r9
-; X64-SSE2-NEXT:    movq -96(%rsp,%rsi,8), %r10
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %r11
-; X64-SSE2-NEXT:    movq -80(%rsp,%rsi,8), %rsi
-; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    andl $7, %eax
+; X64-SSE2-NEXT:    movq (%rsp,%rax,8), %rcx
+; X64-SSE2-NEXT:    movq 8(%rsp,%rax,8), %rsi
+; X64-SSE2-NEXT:    movq 24(%rsp,%rax,8), %rdi
+; X64-SSE2-NEXT:    movq 16(%rsp,%rax,8), %r8
+; X64-SSE2-NEXT:    movq 40(%rsp,%rax,8), %r9
+; X64-SSE2-NEXT:    movq 32(%rsp,%rax,8), %r10
+; X64-SSE2-NEXT:    movq 56(%rsp,%rax,8), %r11
+; X64-SSE2-NEXT:    movq 48(%rsp,%rax,8), %rax
+; X64-SSE2-NEXT:    movq %rax, 48(%rdx)
 ; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
 ; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
 ; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
 ; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rax, (%rdx)
-; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT:    movq %rcx, (%rdx)
+; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    leaq -16(%rbp), %rsp
 ; X64-SSE2-NEXT:    popq %rbx
+; X64-SSE2-NEXT:    popq %r14
+; X64-SSE2-NEXT:    popq %rbp
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: ashr_64bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    pushq %rax
+; X64-SSE42-NEXT:    pushq %rbp
+; X64-SSE42-NEXT:    movq %rsp, %rbp
+; X64-SSE42-NEXT:    andq $-32, %rsp
+; X64-SSE42-NEXT:    subq $160, %rsp
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
 ; X64-SSE42-NEXT:    movq 48(%rdi), %rax
 ; X64-SSE42-NEXT:    movq 56(%rdi), %rcx
 ; X64-SSE42-NEXT:    movl (%rsi), %esi
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, (%rsp)
 ; X64-SSE42-NEXT:    sarq $63, %rcx
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    andl $7, %esi
-; X64-SSE42-NEXT:    movups -128(%rsp,%rsi,8), %xmm0
-; X64-SSE42-NEXT:    movups -112(%rsp,%rsi,8), %xmm1
-; X64-SSE42-NEXT:    movups -96(%rsp,%rsi,8), %xmm2
-; X64-SSE42-NEXT:    movups -80(%rsp,%rsi,8), %xmm3
+; X64-SSE42-NEXT:    movups (%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rsp,%rsi,8), %xmm1
+; X64-SSE42-NEXT:    movups 32(%rsp,%rsi,8), %xmm2
+; X64-SSE42-NEXT:    movups 48(%rsp,%rsi,8), %xmm3
 ; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    popq %rax
+; X64-SSE42-NEXT:    movq %rbp, %rsp
+; X64-SSE42-NEXT:    popq %rbp
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: ashr_64bytes_qwordOff:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    pushq %rax
+; X64-AVX-NEXT:    pushq %rbp
+; X64-AVX-NEXT:    movq %rsp, %rbp
+; X64-AVX-NEXT:    andq $-32, %rsp
+; X64-AVX-NEXT:    subq $160, %rsp
 ; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT:    vmovups 32(%rdi), %xmm1
 ; X64-AVX-NEXT:    movq 48(%rdi), %rax
 ; X64-AVX-NEXT:    movq 56(%rdi), %rcx
 ; X64-AVX-NEXT:    movl (%rsi), %esi
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-AVX-NEXT:    sarq $63, %rcx
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    andl $7, %esi
-; X64-AVX-NEXT:    vmovups -128(%rsp,%rsi,8), %xmm0
-; X64-AVX-NEXT:    vmovups -112(%rsp,%rsi,8), %xmm1
-; X64-AVX-NEXT:    vmovups -96(%rsp,%rsi,8), %xmm2
-; X64-AVX-NEXT:    vmovups -80(%rsp,%rsi,8), %xmm3
+; X64-AVX-NEXT:    vmovups (%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT:    vmovups 16(%rsp,%rsi,8), %xmm1
+; X64-AVX-NEXT:    vmovups 32(%rsp,%rsi,8), %xmm2
+; X64-AVX-NEXT:    vmovups 48(%rsp,%rsi,8), %xmm3
 ; X64-AVX-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT:    popq %rax
+; X64-AVX-NEXT:    movq %rbp, %rsp
+; X64-AVX-NEXT:    popq %rbp
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 221a51ed44696..0cbf1b4bbb586 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -1389,31 +1389,35 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%r8,8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%r8,8), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
@@ -1426,7 +1430,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%r8,8), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
@@ -1437,32 +1441,38 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax,8), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
@@ -1470,40 +1480,46 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rsi,8), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -72(%rsp,%rsi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%rsi,8), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rsi,%rsi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
@@ -1512,31 +1528,37 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax,8), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
@@ -1544,6 +1566,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -1939,26 +1963,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    negb %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r10), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r10), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%r10), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rsp,%r10), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
@@ -1970,10 +1998,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r10), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rsp,%r10), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r10), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rsp,%r10), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
@@ -1991,33 +2019,39 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
@@ -2025,30 +2059,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %sil, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rsp,%rdi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rsi, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
@@ -2056,8 +2096,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, -16(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, 56(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rsp,%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rdi, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rdi
@@ -2069,32 +2109,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
@@ -2102,6 +2148,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2512,33 +2560,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%r8,8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%r8,8), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
@@ -2551,7 +2603,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%r8,8), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
@@ -2562,34 +2614,40 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax,8), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
@@ -2597,42 +2655,48 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rsi,8), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -72(%rsp,%rsi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%rsi,8), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rsi,%rsi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
@@ -2641,33 +2705,39 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax,8), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
@@ -2675,6 +2745,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -3094,12 +3166,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $192, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -3110,34 +3184,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rsp,%r8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
@@ -3146,11 +3221,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rsp,%r8), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 64(%rsp,%r8), %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
@@ -3161,12 +3236,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 72(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 80(%rsp,%r8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
@@ -3177,12 +3252,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
@@ -3192,8 +3267,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -3204,9 +3280,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -3217,62 +3297,67 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 48(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq -24(%rbp), %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -3283,82 +3368,87 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rax), %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r10d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rsp,%rax), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%rax), %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r12,%r12), %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r13, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r12, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq -32(%rbp), %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -3369,52 +3459,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 48(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -4214,11 +4306,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
@@ -4229,26 +4325,26 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    negl %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rbx), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 72(%rsp,%rbx), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
@@ -4259,11 +4355,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 88(%rsp,%rbx), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 80(%rsp,%rbx), %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
@@ -4275,11 +4371,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 104(%rsp,%rbx), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 96(%rsp,%rbx), %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
@@ -4291,10 +4387,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%rbx), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 120(%rsp,%rbx), %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 112(%rsp,%rbx), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
@@ -4316,18 +4412,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $128, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
@@ -4338,38 +4439,38 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r9), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 80(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 88(%rsp,%r9), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r9), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 64(%rsp,%r9), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 72(%rsp,%r9), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r9), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r9), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 96(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 104(%rsp,%r9), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r9), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r9), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 112(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 120(%rsp,%r9), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rdi
@@ -4383,18 +4484,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq -16(%rbp), %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
@@ -4405,18 +4510,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
@@ -4424,17 +4529,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 64(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 72(%rsp,%rsi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rdi, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r9, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 88(%rsp,%rsi), %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 80(%rsp,%rsi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r8
@@ -4442,9 +4547,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 104(%rsp,%rsi), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 96(%rsp,%rsi), %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r15, %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r15, %r15
@@ -4452,8 +4557,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, 120(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 112(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rsi, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %rsi
@@ -4469,18 +4574,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq -32(%rbp), %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $128, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
@@ -4491,38 +4600,38 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 80(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 88(%rsp,%r8), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r8), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 64(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 72(%rsp,%r8), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r8), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r8), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 96(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 104(%rsp,%r8), %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r8), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 112(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 120(%rsp,%r8), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
@@ -4535,9 +4644,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq -16(%rbp), %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -5353,12 +5463,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $192, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -5368,39 +5480,40 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rsp,%r8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
@@ -5409,11 +5522,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rsp,%r8), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 64(%rsp,%r8), %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
@@ -5424,12 +5537,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 72(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 80(%rsp,%r8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
@@ -5440,12 +5553,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 88(%rsp,%r8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
@@ -5455,8 +5568,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq -40(%rbp), %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -5467,9 +5581,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -5479,67 +5597,72 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 48(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq -24(%rbp), %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -5549,87 +5672,92 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rax), %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r10d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rsp,%rax), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%rax), %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r12,%r12), %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r13, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r12, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq -32(%rbp), %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -5639,57 +5767,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rsp,%rax), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 48(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq -24(%rbp), %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index fde915247760a..7b68d3bfdff87 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -964,19 +964,25 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $96, %rsp
 ; X64-NEXT:    movups (%rdi), %xmm0
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    leal (,%rsi,8), %eax
-; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NEXT:    shrb $6, %al
 ; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT:    leaq (%rsp,%rax,8), %rax
 ; X64-NEXT:    andl $7, %esi
 ; X64-NEXT:    movzbl (%rsi,%rax), %eax
 ; X64-NEXT:    movb %al, (%rdx)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1079,19 +1085,23 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rbp
+; X64-NO-BMI2-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    movq (%rsp,%rax,8), %rsi
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-NO-BMI2-NEXT:    addl %eax, %eax
 ; X64-NO-BMI2-NEXT:    andb $56, %cl
 ; X64-NO-BMI2-NEXT:    notb %cl
@@ -1099,29 +1109,37 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NEXT:    orl %esi, %eax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NEXT:    popq %rbp
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rbp
+; X64-BMI2-NEXT:    movq %rsp, %rbp
+; X64-BMI2-NEXT:    andq $-32, %rsp
+; X64-BMI2-NEXT:    subq $96, %rsp
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    shrxq %rsi, (%rsp,%rax,8), %rcx
 ; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
 ; X64-BMI2-NEXT:    andb $56, %sil
 ; X64-BMI2-NEXT:    notb %sil
-; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-BMI2-NEXT:    addl %eax, %eax
 ; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    movq %rbp, %rsp
+; X64-BMI2-NEXT:    popq %rbp
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1223,19 +1241,23 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rbp
+; X64-NO-BMI2-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    movq (%rsp,%rax,8), %rsi
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-NO-BMI2-NEXT:    addl %eax, %eax
 ; X64-NO-BMI2-NEXT:    andb $56, %cl
 ; X64-NO-BMI2-NEXT:    notb %cl
@@ -1243,29 +1265,37 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NEXT:    orl %esi, %eax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NEXT:    popq %rbp
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rbp
+; X64-BMI2-NEXT:    movq %rsp, %rbp
+; X64-BMI2-NEXT:    andq $-32, %rsp
+; X64-BMI2-NEXT:    subq $96, %rsp
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    shrxq %rsi, (%rsp,%rax,8), %rcx
 ; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
 ; X64-BMI2-NEXT:    andb $56, %sil
 ; X64-BMI2-NEXT:    notb %sil
-; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-BMI2-NEXT:    addl %eax, %eax
 ; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    movq %rbp, %rsp
+; X64-BMI2-NEXT:    popq %rbp
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1367,18 +1397,22 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
@@ -1386,46 +1420,60 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    pushq %rbp
+; X64-SHLD-NEXT:    movq %rsp, %rbp
+; X64-SHLD-NEXT:    andq $-32, %rsp
+; X64-SHLD-NEXT:    subq $96, %rsp
 ; X64-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-SHLD-NEXT:    movl %ecx, %eax
 ; X64-SHLD-NEXT:    shrb $6, %al
 ; X64-SHLD-NEXT:    movzbl %al, %eax
-; X64-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
-; X64-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rax
 ; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-SHLD-NEXT:    movq %rbp, %rsp
+; X64-SHLD-NEXT:    popq %rbp
 ; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rsp,%rax,8), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1566,18 +1614,22 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi,8), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rdi,8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
@@ -1588,34 +1640,40 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rdi,8), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rsi,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rsi,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rsi,8), %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
@@ -1623,25 +1681,31 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rcx,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, (%rsp,%rcx,8), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx,8), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rcx,8), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r9
@@ -1651,27 +1715,33 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rax, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rax, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rax
@@ -1679,6 +1749,8 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1866,59 +1938,67 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    pushq %rbp
+; X64-NO-BMI2-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    movq (%rsp,%rsi), %rax
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    movl 8(%rsp,%rsi), %esi
 ; X64-NO-BMI2-NEXT:    addl %esi, %esi
 ; X64-NO-BMI2-NEXT:    notl %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NEXT:    orl %eax, %esi
 ; X64-NO-BMI2-NEXT:    movb %sil, (%rdx)
-; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NEXT:    popq %rbp
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    pushq %rbp
+; X64-BMI2-NEXT:    movq %rsp, %rbp
+; X64-BMI2-NEXT:    andq $-32, %rsp
+; X64-BMI2-NEXT:    subq $160, %rsp
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    movl %eax, %ecx
 ; X64-BMI2-NEXT:    notl %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
-; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT:    movl 8(%rsp,%rsi), %edi
 ; X64-BMI2-NEXT:    addl %edi, %edi
 ; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
-; X64-BMI2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    shrxq %rcx, (%rsp,%rsi), %rcx
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movb %cl, (%rdx)
-; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    movq %rbp, %rsp
+; X64-BMI2-NEXT:    popq %rbp
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2030,59 +2110,67 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    pushq %rbp
+; X64-NO-BMI2-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    movq (%rsp,%rsi), %rax
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    movl 8(%rsp,%rsi), %esi
 ; X64-NO-BMI2-NEXT:    addl %esi, %esi
 ; X64-NO-BMI2-NEXT:    notl %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NEXT:    orl %eax, %esi
 ; X64-NO-BMI2-NEXT:    movw %si, (%rdx)
-; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NEXT:    popq %rbp
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    pushq %rbp
+; X64-BMI2-NEXT:    movq %rsp, %rbp
+; X64-BMI2-NEXT:    andq $-32, %rsp
+; X64-BMI2-NEXT:    subq $160, %rsp
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    movl %eax, %ecx
 ; X64-BMI2-NEXT:    notl %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
-; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT:    movl 8(%rsp,%rsi), %edi
 ; X64-BMI2-NEXT:    addl %edi, %edi
 ; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
-; X64-BMI2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    shrxq %rcx, (%rsp,%rsi), %rcx
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movw %cx, (%rdx)
-; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    movq %rbp, %rsp
+; X64-BMI2-NEXT:    popq %rbp
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2193,59 +2281,67 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    pushq %rbp
+; X64-NO-BMI2-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    movq (%rsp,%rsi), %rax
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    movl 8(%rsp,%rsi), %esi
 ; X64-NO-BMI2-NEXT:    addl %esi, %esi
 ; X64-NO-BMI2-NEXT:    notl %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NEXT:    orl %eax, %esi
 ; X64-NO-BMI2-NEXT:    movl %esi, (%rdx)
-; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NEXT:    popq %rbp
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    pushq %rbp
+; X64-BMI2-NEXT:    movq %rsp, %rbp
+; X64-BMI2-NEXT:    andq $-32, %rsp
+; X64-BMI2-NEXT:    subq $160, %rsp
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    movl %eax, %ecx
 ; X64-BMI2-NEXT:    notl %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
-; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT:    movl 8(%rsp,%rsi), %edi
 ; X64-BMI2-NEXT:    addl %edi, %edi
 ; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
-; X64-BMI2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    shrxq %rcx, (%rsp,%rsi), %rcx
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movl %ecx, (%rdx)
-; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    movq %rbp, %rsp
+; X64-BMI2-NEXT:    popq %rbp
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2356,22 +2452,25 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi), %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
@@ -2379,57 +2478,66 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-SHLD:       # %bb.0:
-; X64-SHLD-NEXT:    pushq %rax
+; X64-SHLD-NEXT:    pushq %rbp
+; X64-SHLD-NEXT:    movq %rsp, %rbp
+; X64-SHLD-NEXT:    andq $-32, %rsp
+; X64-SHLD-NEXT:    subq $160, %rsp
 ; X64-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-SHLD-NEXT:    andl $56, %esi
-; X64-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-SHLD-NEXT:    movq (%rsp,%rsi), %rax
+; X64-SHLD-NEXT:    movq 8(%rsp,%rsi), %rsi
 ; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
 ; X64-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-SHLD-NEXT:    popq %rax
+; X64-SHLD-NEXT:    movq %rbp, %rsp
+; X64-SHLD-NEXT:    popq %rbp
 ; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, (%rsp,%rsi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2582,24 +2690,27 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
@@ -2609,41 +2720,45 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rsi), %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $160, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %edi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rsi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rsi), %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
@@ -2652,32 +2767,36 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, -128(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, (%rsp,%rsi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r10, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rcx
@@ -2688,40 +2807,45 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rsi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rdi, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rax, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2916,26 +3040,29 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $128, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %r8d
@@ -2948,71 +3075,75 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rsi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%rsi), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%rsi), %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq -16(%rbp), %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $128, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %edi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rsi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %edi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rsi), %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rsi), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rsi), %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
@@ -3024,90 +3155,101 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq -16(%rbp), %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, -128(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rbx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %r10, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq -8(%rbp), %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $160, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rsi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rsi), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r11, %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rsp,%rsi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r9, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rax
@@ -3118,7 +3260,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq -8(%rbp), %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index bed8e5806380c..1fe9a148c2bee 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -1193,20 +1193,26 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $96, %rsp
 ; X64-NEXT:    movups (%rdi), %xmm0
 ; X64-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NEXT:    leal (,%rsi,8), %eax
 ; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NEXT:    shrb $6, %al
 ; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT:    leaq (%rsp,%rax,8), %rax
 ; X64-NEXT:    andl $7, %esi
 ; X64-NEXT:    movzbl (%rsi,%rax), %eax
 ; X64-NEXT:    movb %al, (%rdx)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
@@ -1310,20 +1316,24 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rbp
+; X64-NO-BMI2-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    movq (%rsp,%rax,8), %rsi
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-NO-BMI2-NEXT:    addl %eax, %eax
 ; X64-NO-BMI2-NEXT:    andb $56, %cl
 ; X64-NO-BMI2-NEXT:    notb %cl
@@ -1331,30 +1341,38 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NEXT:    orl %esi, %eax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NEXT:    popq %rbp
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rbp
+; X64-BMI2-NEXT:    movq %rsp, %rbp
+; X64-BMI2-NEXT:    andq $-32, %rsp
+; X64-BMI2-NEXT:    subq $96, %rsp
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    shrxq %rsi, (%rsp,%rax,8), %rcx
 ; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
 ; X64-BMI2-NEXT:    andb $56, %sil
 ; X64-BMI2-NEXT:    notb %sil
-; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-BMI2-NEXT:    addl %eax, %eax
 ; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    movq %rbp, %rsp
+; X64-BMI2-NEXT:    popq %rbp
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
@@ -1457,20 +1475,24 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rbp
+; X64-NO-BMI2-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    movq (%rsp,%rax,8), %rsi
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-NO-BMI2-NEXT:    addl %eax, %eax
 ; X64-NO-BMI2-NEXT:    andb $56, %cl
 ; X64-NO-BMI2-NEXT:    notb %cl
@@ -1478,30 +1500,38 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NEXT:    orl %esi, %eax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NEXT:    popq %rbp
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rbp
+; X64-BMI2-NEXT:    movq %rsp, %rbp
+; X64-BMI2-NEXT:    andq $-32, %rsp
+; X64-BMI2-NEXT:    subq $96, %rsp
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, (%rsp)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    shrxq %rsi, (%rsp,%rax,8), %rcx
 ; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
 ; X64-BMI2-NEXT:    andb $56, %sil
 ; X64-BMI2-NEXT:    notb %sil
-; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    movl 8(%rsp,%rax,8), %eax
 ; X64-BMI2-NEXT:    addl %eax, %eax
 ; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    movq %rbp, %rsp
+; X64-BMI2-NEXT:    popq %rbp
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
@@ -1604,19 +1634,23 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
@@ -1624,48 +1658,62 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    pushq %rbp
+; X64-SHLD-NEXT:    movq %rsp, %rbp
+; X64-SHLD-NEXT:    andq $-32, %rsp
+; X64-SHLD-NEXT:    subq $96, %rsp
 ; X64-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-SHLD-NEXT:    movl %ecx, %eax
 ; X64-SHLD-NEXT:    shrb $6, %al
 ; X64-SHLD-NEXT:    movzbl %al, %eax
-; X64-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
-; X64-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rax
 ; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-SHLD-NEXT:    movq %rbp, %rsp
+; X64-SHLD-NEXT:    popq %rbp
 ; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rsp,%rax,8), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
@@ -1807,19 +1855,23 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi,8), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rdi,8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
@@ -1830,35 +1882,41 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rdi,8), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rsi,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rsi,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rsi,8), %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
@@ -1866,26 +1924,32 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rcx,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, (%rsp,%rcx,8), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx,8), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rsp,%rcx,8), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r9
@@ -1895,28 +1959,34 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subq $96, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rsp,%rax,8), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rax, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rax, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rax
@@ -1924,6 +1994,8 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll
index 23c80f4bb523d..326562a425086 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll
@@ -4,8 +4,7 @@
 
 define i129 @halftosi129(half %a) {
 ; CHECK-LABEL: @halftosi129(
-; CHECK-NEXT:    [[TMP1:%.*]] = fptosi half [[A:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i129
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi half [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP2]]
 ;
   %conv = fptosi half %a to i129
@@ -14,35 +13,7 @@ define i129 @halftosi129(half %a) {
 
 define i129 @floattosi129(float %a) {
 ; CHECK-LABEL: @floattosi129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze float [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[TMP0]], -1
-; CHECK-NEXT:    [[SIGN:%.*]] = select i1 [[TMP2]], i129 1, i129 -1
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP0]], 23
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i32 [[TMP5]], 255
-; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP0]], 8388607
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i32 [[TMP3]], 8388608
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i129
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i129 [[TMP8]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i129 [[TMP11]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP13]]
 ;
   %conv = fptosi float %a to i129
@@ -51,35 +22,7 @@ define i129 @floattosi129(float %a) {
 
 define i129 @doubletosi129(double %a) {
 ; CHECK-LABEL: @doubletosi129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze double [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double [[A]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i64 [[TMP0]], -1
-; CHECK-NEXT:    [[SIGN:%.*]] = select i1 [[TMP2]], i129 1, i129 -1
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP0]], 52
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i64 [[TMP5]], 2047
-; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP0]], 4503599627370495
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i64 [[TMP3]], 4503599627370496
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i64 [[BIASED_EXP]], 1023
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i64 [[BIASED_EXP]], 1075
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP14:%.*]] = sub i64 1075, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i64 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP7]] to i129
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i129 [[TMP8]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[BIASED_EXP]], -1075
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i64 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i64 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i129 [[TMP11]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = fptosi double [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP13]]
 ;
   %conv = fptosi double %a to i129
@@ -88,36 +31,7 @@ define i129 @doubletosi129(double %a) {
 
 define i129 @x86_fp80tosi129(x86_fp80 %a) {
 ; CHECK-LABEL: @x86_fp80tosi129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze x86_fp80 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = fpext x86_fp80 [[A]] to fp128
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast fp128 [[TMP0]] to i128
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i128 [[TMP1]], -1
-; CHECK-NEXT:    [[SIGN:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i128 [[TMP1]], 112
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i128 [[TMP6]], 32767
-; CHECK-NEXT:    [[TMP4:%.*]] = and i128 [[TMP1]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i128 [[TMP4]], 5192296858534827628530496329220096
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP15:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP15]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP8]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i129 [[TMP9]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP16:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i128 [[TMP16]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i129 [[TMP12]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i129 [ [[TMP10]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP13]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = fptosi x86_fp80 [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP14]]
 ;
   %conv = fptosi x86_fp80 %a to i129
@@ -126,35 +40,7 @@ define i129 @x86_fp80tosi129(x86_fp80 %a) {
 
 define i129 @fp128tosi129(fp128 %a) {
 ; CHECK-LABEL: @fp128tosi129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze fp128 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast fp128 [[A]] to i128
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i128 [[TMP0]], -1
-; CHECK-NEXT:    [[SIGN:%.*]] = select i1 [[TMP2]], i129 1, i129 -1
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i128 [[TMP0]], 112
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i128 [[TMP5]], 32767
-; CHECK-NEXT:    [[TMP3:%.*]] = and i128 [[TMP0]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i128 [[TMP3]], 5192296858534827628530496329220096
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP14:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP7]] to i129
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i129 [[TMP8]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP15:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i128 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i129 [[TMP11]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = fptosi fp128 [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP13]]
 ;
   %conv = fptosi fp128 %a to i129
@@ -163,67 +49,7 @@ define i129 @fp128tosi129(fp128 %a) {
 
 define <2 x i129> @floattosi129v2(<2 x float> %a) {
 ; CHECK-LABEL: @floattosi129v2(
-; CHECK-NEXT:  fp-to-i-entryfp-to-i-entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = freeze float [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
-; CHECK-NEXT:    [[SIGN7:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP1]], 23
-; CHECK-NEXT:    [[BIASED_EXP8:%.*]] = and i32 [[TMP6]], 255
-; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP1]], 8388607
-; CHECK-NEXT:    [[SIGNIFICAND10:%.*]] = or i32 [[TMP4]], 8388608
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE10:%.*]] = icmp ult i32 [[BIASED_EXP8]], 127
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE10]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE2:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size2:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH12:%.*]] = icmp ult i32 [[BIASED_EXP8]], 150
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH12]], label [[FP_TO_I_IF_EXP_SMALL5:%.*]], label [[FP_TO_I_IF_EXP_LARGE6:%.*]]
-; CHECK:       fp-to-i-if-exp.small3:
-; CHECK-NEXT:    [[TMP18:%.*]] = sub i32 150, [[BIASED_EXP8]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr i32 [[SIGNIFICAND10]], [[TMP18]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i129 [[TMP9]], [[SIGN7]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
-; CHECK:       fp-to-i-if-exp.large4:
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[BIASED_EXP8]], -150
-; CHECK-NEXT:    [[SIGNIFICAND9:%.*]] = zext i32 [[SIGNIFICAND10]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP20]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = shl i129 [[SIGNIFICAND9]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i129 [[TMP12]], [[SIGN7]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
-; CHECK:       fp-to-i-cleanup1:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i129 [ [[TMP10]], [[FP_TO_I_IF_EXP_SMALL5]] ], [ [[TMP13]], [[FP_TO_I_IF_EXP_LARGE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i129> poison, i129 [[TMP14]], i64 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[A]], i64 1
-; CHECK-NEXT:    [[TMP35:%.*]] = freeze float [[TMP16]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float [[TMP35]] to i32
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], -1
-; CHECK-NEXT:    [[SIGN:%.*]] = select i1 [[TMP19]], i129 1, i129 -1
-; CHECK-NEXT:    [[TMP21:%.*]] = lshr i32 [[TMP17]], 23
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i32 [[TMP21]], 255
-; CHECK-NEXT:    [[TMP22:%.*]] = and i32 [[TMP17]], 8388607
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i32 [[TMP22]], 8388608
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP33:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP32]]
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP33]] to i129
-; CHECK-NEXT:    [[TMP26:%.*]] = mul i129 [[TMP25]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP34:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP34]] to i129
-; CHECK-NEXT:    [[TMP28:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i129 [[TMP28]], [[SIGN]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP30:%.*]] = phi i129 [ [[TMP26]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP29]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x i129> [[TMP15]], i129 [[TMP30]], i64 1
+; CHECK-NEXT:    [[TMP31:%.*]] = fptosi <2 x float> [[A:%.*]] to <2 x i129>
 ; CHECK-NEXT:    ret <2 x i129> [[TMP31]]
 ;
   %conv = fptosi <2 x float> %a to <2 x i129>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll
index 864f13fe61624..7f2ea43498de0 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll
@@ -4,8 +4,7 @@
 
 define i129 @halftoui129(half %a) {
 ; CHECK-LABEL: @halftoui129(
-; CHECK-NEXT:    [[TMP1:%.*]] = fptoui half [[A:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui half [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP2]]
 ;
   %conv = fptoui half %a to i129
@@ -14,31 +13,7 @@ define i129 @halftoui129(half %a) {
 
 define i129 @floattoui129(float %a) {
 ; CHECK-LABEL: @floattoui129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze float [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP0]], 23
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i32 [[TMP5]], 255
-; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP0]], 8388607
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i32 [[TMP3]], 8388608
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i129
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i129 [ [[TMP8]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP11]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP13]]
 ;
   %conv = fptoui float %a to i129
@@ -47,31 +22,7 @@ define i129 @floattoui129(float %a) {
 
 define i129 @doubletoui129(double %a) {
 ; CHECK-LABEL: @doubletoui129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze double [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double [[A]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP0]], 52
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i64 [[TMP5]], 2047
-; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP0]], 4503599627370495
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i64 [[TMP3]], 4503599627370496
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i64 [[BIASED_EXP]], 1023
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i64 [[BIASED_EXP]], 1075
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP14:%.*]] = sub i64 1075, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i64 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP7]] to i129
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[BIASED_EXP]], -1075
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i64 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i64 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i129 [ [[TMP8]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP11]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = fptoui double [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP13]]
 ;
   %conv = fptoui double %a to i129
@@ -80,32 +31,7 @@ define i129 @doubletoui129(double %a) {
 
 define i129 @x86_fp80toui129(x86_fp80 %a) {
 ; CHECK-LABEL: @x86_fp80toui129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze x86_fp80 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = fpext x86_fp80 [[A]] to fp128
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast fp128 [[TMP0]] to i128
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i128 [[TMP1]], 112
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i128 [[TMP6]], 32767
-; CHECK-NEXT:    [[TMP4:%.*]] = and i128 [[TMP1]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i128 [[TMP4]], 5192296858534827628530496329220096
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP15:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP15]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP8]] to i129
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP16:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i128 [[TMP16]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP11]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = fptoui x86_fp80 [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP14]]
 ;
   %conv = fptoui x86_fp80 %a to i129
@@ -114,31 +40,7 @@ define i129 @x86_fp80toui129(x86_fp80 %a) {
 
 define i129 @fp128toui129(fp128 %a) {
 ; CHECK-LABEL: @fp128toui129(
-; CHECK-NEXT:  fp-to-i-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze fp128 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast fp128 [[A]] to i128
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i128 [[TMP0]], 112
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i128 [[TMP5]], 32767
-; CHECK-NEXT:    [[TMP3:%.*]] = and i128 [[TMP0]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i128 [[TMP3]], 5192296858534827628530496329220096
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP14:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP7]] to i129
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP15:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i128 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i129 [ [[TMP8]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP11]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = fptoui fp128 [[A:%.*]] to i129
 ; CHECK-NEXT:    ret i129 [[TMP13]]
 ;
   %conv = fptoui fp128 %a to i129
@@ -147,59 +49,7 @@ define i129 @fp128toui129(fp128 %a) {
 
 define <2 x i129> @floattoui129v2(<2 x float> %a) {
 ; CHECK-LABEL: @floattoui129v2(
-; CHECK-NEXT:  fp-to-i-entryfp-to-i-entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = freeze float [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP1]], 23
-; CHECK-NEXT:    [[BIASED_EXP8:%.*]] = and i32 [[TMP6]], 255
-; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP1]], 8388607
-; CHECK-NEXT:    [[SIGNIFICAND10:%.*]] = or i32 [[TMP4]], 8388608
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE10:%.*]] = icmp ult i32 [[BIASED_EXP8]], 127
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE10]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE2:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size2:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH12:%.*]] = icmp ult i32 [[BIASED_EXP8]], 150
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH12]], label [[FP_TO_I_IF_EXP_SMALL5:%.*]], label [[FP_TO_I_IF_EXP_LARGE6:%.*]]
-; CHECK:       fp-to-i-if-exp.small3:
-; CHECK-NEXT:    [[TMP18:%.*]] = sub i32 150, [[BIASED_EXP8]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr i32 [[SIGNIFICAND10]], [[TMP18]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i129
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
-; CHECK:       fp-to-i-if-exp.large4:
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[BIASED_EXP8]], -150
-; CHECK-NEXT:    [[SIGNIFICAND9:%.*]] = zext i32 [[SIGNIFICAND10]] to i129
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP20]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = shl i129 [[SIGNIFICAND9]], [[TMP11]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
-; CHECK:       fp-to-i-cleanup1:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL5]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i129> poison, i129 [[TMP14]], i64 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[A]], i64 1
-; CHECK-NEXT:    [[TMP35:%.*]] = freeze float [[TMP16]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float [[TMP35]] to i32
-; CHECK-NEXT:    [[TMP21:%.*]] = lshr i32 [[TMP17]], 23
-; CHECK-NEXT:    [[BIASED_EXP:%.*]] = and i32 [[TMP21]], 255
-; CHECK-NEXT:    [[TMP22:%.*]] = and i32 [[TMP17]], 8388607
-; CHECK-NEXT:    [[SIGNIFICAND1:%.*]] = or i32 [[TMP22]], 8388608
-; CHECK-NEXT:    [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT:    br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK:       fp-to-i-if-check.exp.size:
-; CHECK-NEXT:    [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT:    br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK:       fp-to-i-if-exp.small:
-; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT:    [[TMP33:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP32]]
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP33]] to i129
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-if-exp.large:
-; CHECK-NEXT:    [[TMP34:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT:    [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP34]] to i129
-; CHECK-NEXT:    [[TMP28:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP27]]
-; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
-; CHECK:       fp-to-i-cleanup:
-; CHECK-NEXT:    [[TMP30:%.*]] = phi i129 [ [[TMP25]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP28]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x i129> [[TMP15]], i129 [[TMP30]], i64 1
+; CHECK-NEXT:    [[TMP31:%.*]] = fptoui <2 x float> [[A:%.*]] to <2 x i129>
 ; CHECK-NEXT:    ret <2 x i129> [[TMP31]]
 ;
   %conv = fptoui <2 x float> %a to <2 x i129>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll
index a3677bafb4449..ec8628688489c 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll
@@ -4,84 +4,7 @@
 
 define half @si129tohalf(i129 %a) {
 ; CHECK-LABEL: @si129tohalf(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr i129 [[TMP3]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i129 [[TMP15]], [[TMP3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT:    [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT:    [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT:    [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT:    [[TMP27:%.*]] = ashr i129 [[TMP26]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT:    br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP32:%.*]] = ashr i129 [[TMP26]], 3
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT:    [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT:    [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT:    [[TMP38:%.*]] = shl i129 [[TMP3]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT:    [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT:    [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT:    [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT:    [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP47]]
-; CHECK-NEXT:    [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT:    [[TMP52:%.*]] = fptrunc float [[TMP51]] to half
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP53:%.*]] = phi half [ [[TMP52]], [[ITOFP_IF_END26]] ], [ 0xH0000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP53:%.*]] = sitofp i129 [[A:%.*]] to half
 ; CHECK-NEXT:    ret half [[TMP53]]
 ;
   %conv = sitofp i129 %a to half
@@ -90,83 +13,7 @@ define half @si129tohalf(i129 %a) {
 
 define float @si129tofloat(i129 %a) {
 ; CHECK-LABEL: @si129tofloat(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr i129 [[TMP3]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i129 [[TMP15]], [[TMP3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT:    [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT:    [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT:    [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT:    [[TMP27:%.*]] = ashr i129 [[TMP26]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT:    br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP32:%.*]] = ashr i129 [[TMP26]], 3
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT:    [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT:    [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT:    [[TMP38:%.*]] = shl i129 [[TMP3]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT:    [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT:    [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT:    [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT:    [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP47]]
-; CHECK-NEXT:    [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP52:%.*]] = phi float [ [[TMP51]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP52:%.*]] = sitofp i129 [[A:%.*]] to float
 ; CHECK-NEXT:    ret float [[TMP52]]
 ;
   %conv = sitofp i129 %a to float
@@ -175,88 +22,7 @@ define float @si129tofloat(i129 %a) {
 
 define double @si129todouble(i129 %a) {
 ; CHECK-LABEL: @si129todouble(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 55, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 74, [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr i129 [[TMP3]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP5]], 55
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i129 [[TMP15]], [[TMP3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT:    [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT:    [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT:    [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT:    [[TMP27:%.*]] = ashr i129 [[TMP26]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP26]], 36028797018963968
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP27]] to i64
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT:    br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP32:%.*]] = ashr i129 [[TMP26]], 3
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP32]] to i64
-; CHECK-NEXT:    [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP5]], -76
-; CHECK-NEXT:    [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT:    [[TMP38:%.*]] = shl i129 [[TMP3]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc i129 [[TMP38]] to i64
-; CHECK-NEXT:    [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT:    [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP35]], [[ITOFP_IF_THEN20]] ], [ [[TMP31]], [[ITOFP_SW_EPILOG]] ], [ [[TMP41]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT:    [[TMP47:%.*]] = shl i32 [[TMP44]], 20
-; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP47]], 1072693248
-; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP43]], 1048575
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT:    [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP48]]
-; CHECK-NEXT:    [[TMP52:%.*]] = zext i32 [[TMP51]] to i64
-; CHECK-NEXT:    [[TMP53:%.*]] = shl i64 [[TMP52]], 32
-; CHECK-NEXT:    [[TMP54:%.*]] = and i64 [[TMP42]], 4294967295
-; CHECK-NEXT:    [[TMP55:%.*]] = or i64 [[TMP53]], [[TMP54]]
-; CHECK-NEXT:    [[TMP56:%.*]] = bitcast i64 [[TMP55]] to double
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP57:%.*]] = phi double [ [[TMP56]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i129 [[A:%.*]] to double
 ; CHECK-NEXT:    ret double [[TMP57]]
 ;
   %conv = sitofp i129 %a to double
@@ -265,83 +31,7 @@ define double @si129todouble(i129 %a) {
 
 define x86_fp80 @si129tox86_fp80(i129 %a) {
 ; CHECK-LABEL: @si129tox86_fp80(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = lshr i129 [[TMP3]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = and i129 [[TMP13]], [[TMP3]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT:    [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT:    [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT:    [[TMP25:%.*]] = ashr i129 [[TMP24]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT:    [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP30:%.*]] = ashr i129 [[TMP24]], 3
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT:    [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT:    [[TMP35:%.*]] = shl i129 [[TMP3]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT:    [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT:    [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT:    [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT:    [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT:    [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT:    [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT:    [[TMP50:%.*]] = fptrunc fp128 [[TMP49]] to x86_fp80
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP51:%.*]] = phi x86_fp80 [ [[TMP50]], [[ITOFP_IF_END26]] ], [ 0xK00000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP51:%.*]] = sitofp i129 [[A:%.*]] to x86_fp80
 ; CHECK-NEXT:    ret x86_fp80 [[TMP51]]
 ;
   %conv = sitofp i129 %a to x86_fp80
@@ -350,82 +40,7 @@ define x86_fp80 @si129tox86_fp80(i129 %a) {
 
 define fp128 @si129tofp128(i129 %a) {
 ; CHECK-LABEL: @si129tofp128(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = lshr i129 [[TMP3]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = and i129 [[TMP13]], [[TMP3]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT:    [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT:    [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT:    [[TMP25:%.*]] = ashr i129 [[TMP24]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT:    [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP30:%.*]] = ashr i129 [[TMP24]], 3
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT:    [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT:    [[TMP35:%.*]] = shl i129 [[TMP3]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT:    [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT:    [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT:    [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT:    [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT:    [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT:    [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP50:%.*]] = phi fp128 [ [[TMP49]], [[ITOFP_IF_END26]] ], [ 0xL00000000000000000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP50:%.*]] = sitofp i129 [[A:%.*]] to fp128
 ; CHECK-NEXT:    ret fp128 [[TMP50]]
 ;
   %conv = sitofp i129 %a to fp128
@@ -434,163 +49,7 @@ define fp128 @si129tofp128(i129 %a) {
 
 define <2 x float> @si129tofloatv2(<2 x i129> %a) {
 ; CHECK-LABEL: @si129tofloatv2(
-; CHECK-NEXT:  itofp-entryitofp-entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP110:%.*]] = freeze i129 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i129 [[TMP110]], 0
-; CHECK-NEXT:    br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
-; CHECK:       itofp-if-end2:
-; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP110]], 128
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP110]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP4]], i1 true)
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 129, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 128, [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
-; CHECK-NEXT:    br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
-; CHECK:       itofp-if-then43:
-; CHECK-NEXT:    switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB4:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb4:
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i129 [[TMP4]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
-; CHECK:       itofp-sw-default5:
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 103, [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP6]], 26
-; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
-; CHECK-NEXT:    [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP4]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
-; CHECK-NEXT:    [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
-; CHECK:       itofp-sw-epilog6:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP4]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
-; CHECK-NEXT:    [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
-; CHECK-NEXT:    [[TMP24:%.*]] = and i32 [[TMP23]], 1
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
-; CHECK-NEXT:    [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = add i129 [[TMP26]], 1
-; CHECK-NEXT:    [[TMP28:%.*]] = ashr i129 [[TMP27]], 2
-; CHECK-NEXT:    [[A310:%.*]] = and i129 [[TMP27]], 67108864
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
-; CHECK-NEXT:    [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
-; CHECK-NEXT:    [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
-; CHECK-NEXT:    [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
-; CHECK-NEXT:    br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
-; CHECK:       itofp-if-then207:
-; CHECK-NEXT:    [[TMP33:%.*]] = ashr i129 [[TMP27]], 3
-; CHECK-NEXT:    [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
-; CHECK-NEXT:    [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
-; CHECK:       itofp-if-else8:
-; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP6]], -105
-; CHECK-NEXT:    [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
-; CHECK-NEXT:    [[TMP39:%.*]] = shl i129 [[TMP4]], [[TMP38]]
-; CHECK-NEXT:    [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
-; CHECK-NEXT:    [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
-; CHECK-NEXT:    [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
-; CHECK:       itofp-if-end269:
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT:    [[TMP47:%.*]] = shl i32 [[TMP44]], 23
-; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
-; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT:    [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP48]]
-; CHECK-NEXT:    [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
-; CHECK-NEXT:    br label [[ITOFP_RETURN1]]
-; CHECK:       itofp-return1:
-; CHECK-NEXT:    [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
-; CHECK-NEXT:    [[TMP111:%.*]] = freeze i129 [[TMP55]]
-; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i129 [[TMP111]], 0
-; CHECK-NEXT:    br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP57:%.*]] = ashr i129 [[TMP111]], 128
-; CHECK-NEXT:    [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP111]]
-; CHECK-NEXT:    [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
-; CHECK-NEXT:    [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP59]], i1 true)
-; CHECK-NEXT:    [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
-; CHECK-NEXT:    [[TMP62:%.*]] = sub i32 129, [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 128, [[TMP61]]
-; CHECK-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
-; CHECK-NEXT:    br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP65:%.*]] = shl i129 [[TMP59]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP66:%.*]] = sub i32 103, [[TMP61]]
-; CHECK-NEXT:    [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
-; CHECK-NEXT:    [[TMP68:%.*]] = lshr i129 [[TMP59]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = add i32 [[TMP61]], 26
-; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
-; CHECK-NEXT:    [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
-; CHECK-NEXT:    [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP59]]
-; CHECK-NEXT:    [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
-; CHECK-NEXT:    [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
-; CHECK-NEXT:    [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP59]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
-; CHECK-NEXT:    [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
-; CHECK-NEXT:    [[TMP79:%.*]] = and i32 [[TMP78]], 1
-; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
-; CHECK-NEXT:    [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
-; CHECK-NEXT:    [[TMP82:%.*]] = add i129 [[TMP81]], 1
-; CHECK-NEXT:    [[TMP83:%.*]] = ashr i129 [[TMP82]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP82]], 67108864
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
-; CHECK-NEXT:    [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
-; CHECK-NEXT:    [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
-; CHECK-NEXT:    br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP88:%.*]] = ashr i129 [[TMP82]], 3
-; CHECK-NEXT:    [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
-; CHECK-NEXT:    [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
-; CHECK-NEXT:    [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP61]], -105
-; CHECK-NEXT:    [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
-; CHECK-NEXT:    [[TMP94:%.*]] = shl i129 [[TMP59]], [[TMP93]]
-; CHECK-NEXT:    [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
-; CHECK-NEXT:    [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
-; CHECK-NEXT:    [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
-; CHECK-NEXT:    [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
-; CHECK-NEXT:    [[TMP102:%.*]] = shl i32 [[TMP99]], 23
-; CHECK-NEXT:    [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
-; CHECK-NEXT:    [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
-; CHECK-NEXT:    [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
-; CHECK-NEXT:    [[TMP106:%.*]] = or i32 [[TMP105]], [[TMP103]]
-; CHECK-NEXT:    [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
-; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT:    [[TMP109:%.*]] = sitofp <2 x i129> [[A:%.*]] to <2 x float>
 ; CHECK-NEXT:    ret <2 x float> [[TMP109]]
 ;
   %conv = sitofp <2 x i129> %a to <2 x float>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll
index eed61b7c53989..ea161746e49c6 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll
@@ -4,84 +4,7 @@
 
 define half @ui129tohalf(i129 %a) {
 ; CHECK-LABEL: @ui129tohalf(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr i129 [[A]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i129 [[TMP15]], [[A]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT:    [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT:    [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT:    [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT:    [[TMP27:%.*]] = lshr i129 [[TMP26]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT:    br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP32:%.*]] = lshr i129 [[TMP26]], 3
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT:    [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT:    [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT:    [[TMP38:%.*]] = shl i129 [[A]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT:    [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT:    [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT:    [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT:    [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT:    [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT:    [[TMP52:%.*]] = fptrunc float [[TMP51]] to half
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP53:%.*]] = phi half [ [[TMP52]], [[ITOFP_IF_END26]] ], [ 0xH0000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP53:%.*]] = uitofp i129 [[A:%.*]] to half
 ; CHECK-NEXT:    ret half [[TMP53]]
 ;
   %conv = uitofp i129 %a to half
@@ -90,83 +13,7 @@ define half @ui129tohalf(i129 %a) {
 
 define float @ui129tofloat(i129 %a) {
 ; CHECK-LABEL: @ui129tofloat(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr i129 [[A]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i129 [[TMP15]], [[A]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT:    [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT:    [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT:    [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT:    [[TMP27:%.*]] = lshr i129 [[TMP26]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT:    br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP32:%.*]] = lshr i129 [[TMP26]], 3
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT:    [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT:    [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT:    [[TMP38:%.*]] = shl i129 [[A]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT:    [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT:    [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT:    [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT:    [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT:    [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP52:%.*]] = phi float [ [[TMP51]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP52:%.*]] = uitofp i129 [[A:%.*]] to float
 ; CHECK-NEXT:    ret float [[TMP52]]
 ;
   %conv = uitofp i129 %a to float
@@ -175,88 +22,7 @@ define float @ui129tofloat(i129 %a) {
 
 define double @ui129todouble(i129 %a) {
 ; CHECK-LABEL: @ui129todouble(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 55, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 74, [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr i129 [[A]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP5]], 55
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i129 [[TMP15]], [[A]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT:    [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT:    [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT:    [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT:    [[TMP27:%.*]] = lshr i129 [[TMP26]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP26]], 36028797018963968
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP27]] to i64
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT:    br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP32:%.*]] = lshr i129 [[TMP26]], 3
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP32]] to i64
-; CHECK-NEXT:    [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT:    [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[TMP5]], -76
-; CHECK-NEXT:    [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT:    [[TMP38:%.*]] = shl i129 [[A]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = trunc i129 [[TMP38]] to i64
-; CHECK-NEXT:    [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT:    [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP42:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP35]], [[ITOFP_IF_THEN20]] ], [ [[TMP31]], [[ITOFP_SW_EPILOG]] ], [ [[TMP41]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT:    [[TMP47:%.*]] = shl i32 [[TMP44]], 20
-; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP47]], 1072693248
-; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP43]], 1048575
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT:    [[TMP51:%.*]] = or i32 [[TMP49]], [[TMP48]]
-; CHECK-NEXT:    [[TMP52:%.*]] = zext i32 [[TMP51]] to i64
-; CHECK-NEXT:    [[TMP53:%.*]] = shl i64 [[TMP52]], 32
-; CHECK-NEXT:    [[TMP54:%.*]] = and i64 [[TMP42]], 4294967295
-; CHECK-NEXT:    [[TMP55:%.*]] = or i64 [[TMP53]], [[TMP54]]
-; CHECK-NEXT:    [[TMP56:%.*]] = bitcast i64 [[TMP55]] to double
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP57:%.*]] = phi double [ [[TMP56]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP57:%.*]] = uitofp i129 [[A:%.*]] to double
 ; CHECK-NEXT:    ret double [[TMP57]]
 ;
   %conv = uitofp i129 %a to double
@@ -265,83 +31,7 @@ define double @ui129todouble(i129 %a) {
 
 define x86_fp80 @ui129tox86_fp80(i129 %a) {
 ; CHECK-LABEL: @ui129tox86_fp80(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = lshr i129 [[A]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = and i129 [[TMP13]], [[A]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT:    [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT:    [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT:    [[TMP25:%.*]] = lshr i129 [[TMP24]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT:    [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP24]], 3
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT:    [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT:    [[TMP35:%.*]] = shl i129 [[A]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT:    [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT:    [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT:    [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT:    [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT:    [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT:    [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT:    [[TMP50:%.*]] = fptrunc fp128 [[TMP49]] to x86_fp80
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP51:%.*]] = phi x86_fp80 [ [[TMP50]], [[ITOFP_IF_END26]] ], [ 0xK00000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i129 [[A:%.*]] to x86_fp80
 ; CHECK-NEXT:    ret x86_fp80 [[TMP51]]
 ;
   %conv = uitofp i129 %a to x86_fp80
@@ -350,82 +40,7 @@ define x86_fp80 @ui129tox86_fp80(i129 %a) {
 
 define fp128 @ui129tofp128(i129 %a) {
 ; CHECK-LABEL: @ui129tofp128(
-; CHECK-NEXT:  itofp-entry:
-; CHECK-NEXT:    [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = lshr i129 [[A]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = and i129 [[TMP13]], [[A]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT:    [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT:    [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT:    [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT:    [[TMP25:%.*]] = lshr i129 [[TMP24]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT:    [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP30:%.*]] = lshr i129 [[TMP24]], 3
-; CHECK-NEXT:    [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT:    [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT:    [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT:    [[TMP35:%.*]] = shl i129 [[A]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT:    [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT:    [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT:    [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT:    [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT:    [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT:    [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT:    [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP50:%.*]] = phi fp128 [ [[TMP49]], [[ITOFP_IF_END26]] ], [ 0xL00000000000000000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i129 [[A:%.*]] to fp128
 ; CHECK-NEXT:    ret fp128 [[TMP50]]
 ;
   %conv = uitofp i129 %a to fp128
@@ -434,163 +49,7 @@ define fp128 @ui129tofp128(i129 %a) {
 
 define <2 x float> @ui129tofloatv2(<2 x i129> %a) {
 ; CHECK-LABEL: @ui129tofloatv2(
-; CHECK-NEXT:  itofp-entryitofp-entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = freeze i129 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i129 [[TMP10]], 0
-; CHECK-NEXT:    br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
-; CHECK:       itofp-if-end2:
-; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP10]], 128
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP10]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP10]], i1 true)
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 129, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 128, [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
-; CHECK-NEXT:    br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
-; CHECK:       itofp-if-then43:
-; CHECK-NEXT:    switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB4:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb4:
-; CHECK-NEXT:    [[TMP65:%.*]] = shl i129 [[TMP10]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
-; CHECK:       itofp-sw-default5:
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 103, [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 [[TMP10]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP6]], 26
-; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
-; CHECK-NEXT:    [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP10]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
-; CHECK-NEXT:    [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
-; CHECK:       itofp-sw-epilog6:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP10]], [[ITOFP_IF_THEN43]] ], [ [[TMP65]], [[ITOFP_SW_BB4]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
-; CHECK-NEXT:    [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
-; CHECK-NEXT:    [[TMP24:%.*]] = and i32 [[TMP23]], 1
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
-; CHECK-NEXT:    [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = add i129 [[TMP26]], 1
-; CHECK-NEXT:    [[TMP28:%.*]] = lshr i129 [[TMP27]], 2
-; CHECK-NEXT:    [[A310:%.*]] = and i129 [[TMP27]], 67108864
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
-; CHECK-NEXT:    [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
-; CHECK-NEXT:    [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
-; CHECK-NEXT:    [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
-; CHECK-NEXT:    br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
-; CHECK:       itofp-if-then207:
-; CHECK-NEXT:    [[TMP33:%.*]] = lshr i129 [[TMP27]], 3
-; CHECK-NEXT:    [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
-; CHECK-NEXT:    [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
-; CHECK:       itofp-if-else8:
-; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP6]], -105
-; CHECK-NEXT:    [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
-; CHECK-NEXT:    [[TMP39:%.*]] = shl i129 [[TMP10]], [[TMP38]]
-; CHECK-NEXT:    [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
-; CHECK-NEXT:    [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
-; CHECK-NEXT:    [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
-; CHECK:       itofp-if-end269:
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT:    [[TMP47:%.*]] = shl i32 [[TMP44]], 23
-; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
-; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
-; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT:    [[TMP51:%.*]] = or i32 [[TMP49]], [[TMP48]]
-; CHECK-NEXT:    [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
-; CHECK-NEXT:    br label [[ITOFP_RETURN1]]
-; CHECK:       itofp-return1:
-; CHECK-NEXT:    [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
-; CHECK-NEXT:    [[TMP110:%.*]] = freeze i129 [[TMP55]]
-; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i129 [[TMP110]], 0
-; CHECK-NEXT:    br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK:       itofp-if-end:
-; CHECK-NEXT:    [[TMP57:%.*]] = ashr i129 [[TMP110]], 128
-; CHECK-NEXT:    [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP110]]
-; CHECK-NEXT:    [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
-; CHECK-NEXT:    [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP110]], i1 true)
-; CHECK-NEXT:    [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
-; CHECK-NEXT:    [[TMP62:%.*]] = sub i32 129, [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 128, [[TMP61]]
-; CHECK-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
-; CHECK-NEXT:    br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK:       itofp-if-then4:
-; CHECK-NEXT:    switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       itofp-sw-bb:
-; CHECK-NEXT:    [[TMP111:%.*]] = shl i129 [[TMP110]], 1
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-default:
-; CHECK-NEXT:    [[TMP66:%.*]] = sub i32 103, [[TMP61]]
-; CHECK-NEXT:    [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
-; CHECK-NEXT:    [[TMP68:%.*]] = lshr i129 [[TMP110]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = add i32 [[TMP61]], 26
-; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
-; CHECK-NEXT:    [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
-; CHECK-NEXT:    [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP110]]
-; CHECK-NEXT:    [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
-; CHECK-NEXT:    [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
-; CHECK-NEXT:    [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
-; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
-; CHECK:       itofp-sw-epilog:
-; CHECK-NEXT:    [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP110]], [[ITOFP_IF_THEN4]] ], [ [[TMP111]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
-; CHECK-NEXT:    [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
-; CHECK-NEXT:    [[TMP79:%.*]] = and i32 [[TMP78]], 1
-; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
-; CHECK-NEXT:    [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
-; CHECK-NEXT:    [[TMP82:%.*]] = add i129 [[TMP81]], 1
-; CHECK-NEXT:    [[TMP83:%.*]] = lshr i129 [[TMP82]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP82]], 67108864
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT:    [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
-; CHECK-NEXT:    [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
-; CHECK-NEXT:    [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
-; CHECK-NEXT:    br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK:       itofp-if-then20:
-; CHECK-NEXT:    [[TMP88:%.*]] = lshr i129 [[TMP82]], 3
-; CHECK-NEXT:    [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
-; CHECK-NEXT:    [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
-; CHECK-NEXT:    [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-else:
-; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP61]], -105
-; CHECK-NEXT:    [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
-; CHECK-NEXT:    [[TMP94:%.*]] = shl i129 [[TMP110]], [[TMP93]]
-; CHECK-NEXT:    [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
-; CHECK-NEXT:    [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
-; CHECK-NEXT:    [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
-; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
-; CHECK:       itofp-if-end26:
-; CHECK-NEXT:    [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
-; CHECK-NEXT:    [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
-; CHECK-NEXT:    [[TMP102:%.*]] = shl i32 [[TMP99]], 23
-; CHECK-NEXT:    [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
-; CHECK-NEXT:    [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
-; CHECK-NEXT:    [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
-; CHECK-NEXT:    [[TMP106:%.*]] = or i32 [[TMP104]], [[TMP103]]
-; CHECK-NEXT:    [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
-; CHECK-NEXT:    br label [[ITOFP_RETURN]]
-; CHECK:       itofp-return:
-; CHECK-NEXT:    [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
-; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT:    [[TMP109:%.*]] = uitofp <2 x i129> [[A:%.*]] to <2 x float>
 ; CHECK-NEXT:    ret <2 x float> [[TMP109]]
 ;
   %conv = uitofp <2 x i129> %a to <2 x float>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll
index a0e00be2a94ff..87a0260c5c2f7 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll
@@ -8,88 +8,8 @@
 define double @main(i224 %0) #0 {
 ; CHECK-LABEL: define double @main(
 ; CHECK-SAME: i224 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRYITOFP_ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP59:%.*]] = freeze i224 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i224 [[TMP59]], 0
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[ITOFP_RETURN:.*]], label %[[ITOFP_IF_END:.*]]
-; CHECK:       [[ITOFP_IF_END]]:
-; CHECK-NEXT:    [[TMP2:%.*]] = ashr i224 [[TMP59]], 223
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i224 [[TMP2]], [[TMP59]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i224 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i224 @llvm.ctlz.i224(i224 [[TMP4]], i1 true)
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i224 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 224, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 223, [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 53
-; CHECK-NEXT:    br i1 [[TMP9]], label %[[ITOFP_IF_THEN4:.*]], label %[[ITOFP_IF_ELSE:.*]]
-; CHECK:       [[ITOFP_IF_THEN4]]:
-; CHECK-NEXT:    switch i32 [[TMP7]], label %[[ITOFP_SW_DEFAULT:.*]] [
-; CHECK-NEXT:      i32 54, label %[[ITOFP_SW_BB:.*]]
-; CHECK-NEXT:      i32 55, label %[[ITOFP_SW_EPILOG:.*]]
-; CHECK-NEXT:    ]
-; CHECK:       [[ITOFP_SW_BB]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i224 [[TMP4]], 1
-; CHECK-NEXT:    br label %[[ITOFP_SW_EPILOG]]
-; CHECK:       [[ITOFP_SW_DEFAULT]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 169, [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i224
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i224 [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP6]], 55
-; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i224
-; CHECK-NEXT:    [[TMP16:%.*]] = lshr i224 -1, [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = and i224 [[TMP16]], [[TMP4]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i224 [[TMP17]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i224
-; CHECK-NEXT:    [[TMP20:%.*]] = or i224 [[TMP13]], [[TMP19]]
-; CHECK-NEXT:    br label %[[ITOFP_SW_EPILOG]]
-; CHECK:       [[ITOFP_SW_EPILOG]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i224 [ [[TMP20]], %[[ITOFP_SW_DEFAULT]] ], [ [[TMP4]], %[[ITOFP_IF_THEN4]] ], [ [[TMP10]], %[[ITOFP_SW_BB]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = trunc i224 [[TMP21]] to i32
-; CHECK-NEXT:    [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
-; CHECK-NEXT:    [[TMP24:%.*]] = and i32 [[TMP23]], 1
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i224
-; CHECK-NEXT:    [[TMP26:%.*]] = or i224 [[TMP21]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = add i224 [[TMP26]], 1
-; CHECK-NEXT:    [[TMP28:%.*]] = ashr i224 [[TMP27]], 2
-; CHECK-NEXT:    [[A3:%.*]] = and i224 [[TMP27]], 36028797018963968
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i224 [[A3]], 0
-; CHECK-NEXT:    [[TMP30:%.*]] = trunc i224 [[TMP28]] to i64
-; CHECK-NEXT:    [[TMP31:%.*]] = lshr i224 [[TMP28]], 32
-; CHECK-NEXT:    [[TMP32:%.*]] = trunc i224 [[TMP31]] to i32
-; CHECK-NEXT:    br i1 [[TMP29]], label %[[ITOFP_IF_END26:.*]], label %[[ITOFP_IF_THEN20:.*]]
-; CHECK:       [[ITOFP_IF_THEN20]]:
-; CHECK-NEXT:    [[TMP33:%.*]] = ashr i224 [[TMP27]], 3
-; CHECK-NEXT:    [[TMP34:%.*]] = trunc i224 [[TMP33]] to i64
-; CHECK-NEXT:    [[TMP35:%.*]] = lshr i224 [[TMP33]], 32
-; CHECK-NEXT:    [[TMP36:%.*]] = trunc i224 [[TMP35]] to i32
-; CHECK-NEXT:    br label %[[ITOFP_IF_END26]]
-; CHECK:       [[ITOFP_IF_ELSE]]:
-; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP6]], -171
-; CHECK-NEXT:    [[TMP38:%.*]] = zext i32 [[TMP37]] to i224
-; CHECK-NEXT:    [[TMP39:%.*]] = shl i224 [[TMP4]], [[TMP38]]
-; CHECK-NEXT:    [[TMP40:%.*]] = trunc i224 [[TMP39]] to i64
-; CHECK-NEXT:    [[TMP41:%.*]] = lshr i224 [[TMP39]], 32
-; CHECK-NEXT:    [[TMP42:%.*]] = trunc i224 [[TMP41]] to i32
-; CHECK-NEXT:    br label %[[ITOFP_IF_END26]]
-; CHECK:       [[ITOFP_IF_END26]]:
-; CHECK-NEXT:    [[TMP43:%.*]] = phi i64 [ [[TMP34]], %[[ITOFP_IF_THEN20]] ], [ [[TMP30]], %[[ITOFP_SW_EPILOG]] ], [ [[TMP40]], %[[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP44:%.*]] = phi i32 [ [[TMP36]], %[[ITOFP_IF_THEN20]] ], [ [[TMP32]], %[[ITOFP_SW_EPILOG]] ], [ [[TMP42]], %[[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP45:%.*]] = phi i32 [ [[TMP7]], %[[ITOFP_IF_THEN20]] ], [ [[TMP8]], %[[ITOFP_SW_EPILOG]] ], [ [[TMP8]], %[[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT:    [[TMP46:%.*]] = trunc i224 [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP47:%.*]] = and i32 [[TMP46]], -2147483648
-; CHECK-NEXT:    [[TMP48:%.*]] = shl i32 [[TMP45]], 20
-; CHECK-NEXT:    [[TMP49:%.*]] = add i32 [[TMP48]], 1072693248
-; CHECK-NEXT:    [[TMP50:%.*]] = and i32 [[TMP44]], 1048575
-; CHECK-NEXT:    [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP47]]
-; CHECK-NEXT:    [[TMP52:%.*]] = or i32 [[TMP51]], [[TMP49]]
-; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP52]] to i64
-; CHECK-NEXT:    [[TMP54:%.*]] = shl i64 [[TMP53]], 32
-; CHECK-NEXT:    [[TMP55:%.*]] = and i64 [[TMP43]], 4294967295
-; CHECK-NEXT:    [[TMP56:%.*]] = or i64 [[TMP54]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i64 [[TMP56]] to double
-; CHECK-NEXT:    br label %[[ITOFP_RETURN]]
-; CHECK:       [[ITOFP_RETURN]]:
-; CHECK-NEXT:    [[TMP58:%.*]] = phi double [ [[TMP57]], %[[ITOFP_IF_END26]] ], [ 0.000000e+00, %[[ENTRYITOFP_ENTRY]] ]
+; CHECK-NEXT:  [[ITOFP_RETURN:.*:]]
+; CHECK-NEXT:    [[TMP58:%.*]] = sitofp i224 [[TMP0]] to double
 ; CHECK-NEXT:    ret double [[TMP58]]
 ;
 entry:
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
index 751bdbade15d9..182073312504f 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
@@ -5,7 +5,7 @@
 define void @sdiv129(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @sdiv129(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -67,7 +67,7 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-NEXT:    [[TMP48:%.*]] = phi i129 [ [[TMP25]], [[UDIV_LOOP_EXIT]] ], [ [[TMP20]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP49:%.*]] = xor i129 [[TMP48]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = sub i129 [[TMP49]], [[TMP8]]
-; CHECK-NEXT:    store i129 [[TMP50]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    store i129 [[TMP50]], ptr [[OUT:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
index 45491ccda2b19..6257ac24355a3 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
@@ -5,7 +5,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -70,7 +70,7 @@ define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-NEXT:    [[TMP51:%.*]] = sub i129 [[TMP8]], [[TMP50]]
 ; CHECK-NEXT:    [[TMP52:%.*]] = xor i129 [[TMP51]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP53:%.*]] = sub i129 [[TMP52]], [[TMP2]]
-; CHECK-NEXT:    store i129 [[TMP53]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    store i129 [[TMP53]], ptr [[OUT:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
index 6ad696ae446fd..9f96313d29891 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
@@ -5,7 +5,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i129 [[TMP0]], 0
@@ -56,7 +56,7 @@ define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-NEXT:    br i1 [[TMP38]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi i129 [ [[TMP16]], [[UDIV_LOOP_EXIT]] ], [ [[TMP11]], [[_UDIV_SPECIAL_CASES:%.*]] ]
-; CHECK-NEXT:    store i129 [[TMP39]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    store i129 [[TMP39]], ptr [[OUT:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
index a4c4ac2cba329..a20ffd2575d33 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
@@ -5,7 +5,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze i129 [[TMP1]]
@@ -60,7 +60,7 @@ define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-NEXT:    [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i129 [[TMP1]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = sub i129 [[TMP0]], [[TMP42]]
-; CHECK-NEXT:    store i129 [[TMP43]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    store i129 [[TMP43]], ptr [[OUT:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr

>From 4829f992c80a837aca96389223a259a51cfe0823 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:39:47 +0100
Subject: [PATCH 07/17] [compiler-rt] Add __int256 builtins

Add 37 new runtime builtins for 256-bit integer operations, conditional
on CRT_HAS_256BIT (requires 64-bit pointers + compiler __int256 support):
- Arithmetic: multi5 (multiply), divmodoi4/udivmodoi4, div/mod/udiv/umod
- Shifts: ashloi3, ashroi3, lshroi3
- Comparisons: cmpoi2, ucmpoi2
- Bit operations: clzoi2, ctzoi2, ffsoi2, popcountoi2, parityoi2
- Overflow-checked: addvoi3, subvoi3, mulvoi3, absvoi2, negvoi2, negoi2
- Float conversions: fix/fixuns/float/floatun for sf/df/tf/xf

Extends int_types.h (oi_int = 256-bit), int_lib.h, fp_fixint_impl.inc,
int_to_fp.h, int_to_fp_impl.inc with generic 256-bit support.
CMakeLists.txt conditionally compiles on 64-bit targets.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 compiler-rt/lib/builtins/CMakeLists.txt     |  40 ++++++
 compiler-rt/lib/builtins/absvoi2.c          |  29 ++++
 compiler-rt/lib/builtins/addvoi3.c          |  33 +++++
 compiler-rt/lib/builtins/ashloi3.c          |  39 ++++++
 compiler-rt/lib/builtins/ashroi3.c          |  40 ++++++
 compiler-rt/lib/builtins/clzoi2.c           |  29 ++++
 compiler-rt/lib/builtins/cmpoi2.c           |  37 +++++
 compiler-rt/lib/builtins/ctzoi2.c           |  29 ++++
 compiler-rt/lib/builtins/divmodoi4.c        |  32 +++++
 compiler-rt/lib/builtins/divoi3.c           |  26 ++++
 compiler-rt/lib/builtins/ffsoi2.c           |  31 +++++
 compiler-rt/lib/builtins/fixdfoi.c          |  21 +++
 compiler-rt/lib/builtins/fixsfoi.c          |  21 +++
 compiler-rt/lib/builtins/fixtfoi.c          |  18 +++
 compiler-rt/lib/builtins/fixunsdfoi.c       |  17 +++
 compiler-rt/lib/builtins/fixunssfoi.c       |  17 +++
 compiler-rt/lib/builtins/fixunstfoi.c       |  17 +++
 compiler-rt/lib/builtins/fixunsxfoi.c       |  44 ++++++
 compiler-rt/lib/builtins/fixxfoi.c          |  46 ++++++
 compiler-rt/lib/builtins/floatoidf.c        |  23 +++
 compiler-rt/lib/builtins/floatoisf.c        |  23 +++
 compiler-rt/lib/builtins/floatoitf.c        |  26 ++++
 compiler-rt/lib/builtins/floatoixf.c        |  73 ++++++++++
 compiler-rt/lib/builtins/floatunoidf.c      |  23 +++
 compiler-rt/lib/builtins/floatunoisf.c      |  23 +++
 compiler-rt/lib/builtins/floatunoitf.c      |  26 ++++
 compiler-rt/lib/builtins/floatunoixf.c      |  70 ++++++++++
 compiler-rt/lib/builtins/fp_fixint_impl.inc |   4 +-
 compiler-rt/lib/builtins/int_lib.h          |   6 +
 compiler-rt/lib/builtins/int_to_fp.h        |  10 ++
 compiler-rt/lib/builtins/int_to_fp_impl.inc |   6 +
 compiler-rt/lib/builtins/int_types.h        |  50 +++++++
 compiler-rt/lib/builtins/lshroi3.c          |  38 +++++
 compiler-rt/lib/builtins/modoi3.c           |  26 ++++
 compiler-rt/lib/builtins/muloi5.c           |  29 ++++
 compiler-rt/lib/builtins/multi5.c           |  51 +++++++
 compiler-rt/lib/builtins/mulvoi3.c          |  27 ++++
 compiler-rt/lib/builtins/negoi2.c           |  25 ++++
 compiler-rt/lib/builtins/negvoi2.c          |  28 ++++
 compiler-rt/lib/builtins/parityoi2.c        |  36 +++++
 compiler-rt/lib/builtins/popcountoi2.c      |  27 ++++
 compiler-rt/lib/builtins/subvoi3.c          |  33 +++++
 compiler-rt/lib/builtins/ucmpoi2.c          |  37 +++++
 compiler-rt/lib/builtins/udivmodoi4.c       | 147 ++++++++++++++++++++
 compiler-rt/lib/builtins/udivoi3.c          |  23 +++
 compiler-rt/lib/builtins/umodoi3.c          |  25 ++++
 46 files changed, 1479 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/lib/builtins/absvoi2.c
 create mode 100644 compiler-rt/lib/builtins/addvoi3.c
 create mode 100644 compiler-rt/lib/builtins/ashloi3.c
 create mode 100644 compiler-rt/lib/builtins/ashroi3.c
 create mode 100644 compiler-rt/lib/builtins/clzoi2.c
 create mode 100644 compiler-rt/lib/builtins/cmpoi2.c
 create mode 100644 compiler-rt/lib/builtins/ctzoi2.c
 create mode 100644 compiler-rt/lib/builtins/divmodoi4.c
 create mode 100644 compiler-rt/lib/builtins/divoi3.c
 create mode 100644 compiler-rt/lib/builtins/ffsoi2.c
 create mode 100644 compiler-rt/lib/builtins/fixdfoi.c
 create mode 100644 compiler-rt/lib/builtins/fixsfoi.c
 create mode 100644 compiler-rt/lib/builtins/fixtfoi.c
 create mode 100644 compiler-rt/lib/builtins/fixunsdfoi.c
 create mode 100644 compiler-rt/lib/builtins/fixunssfoi.c
 create mode 100644 compiler-rt/lib/builtins/fixunstfoi.c
 create mode 100644 compiler-rt/lib/builtins/fixunsxfoi.c
 create mode 100644 compiler-rt/lib/builtins/fixxfoi.c
 create mode 100644 compiler-rt/lib/builtins/floatoidf.c
 create mode 100644 compiler-rt/lib/builtins/floatoisf.c
 create mode 100644 compiler-rt/lib/builtins/floatoitf.c
 create mode 100644 compiler-rt/lib/builtins/floatoixf.c
 create mode 100644 compiler-rt/lib/builtins/floatunoidf.c
 create mode 100644 compiler-rt/lib/builtins/floatunoisf.c
 create mode 100644 compiler-rt/lib/builtins/floatunoitf.c
 create mode 100644 compiler-rt/lib/builtins/floatunoixf.c
 create mode 100644 compiler-rt/lib/builtins/lshroi3.c
 create mode 100644 compiler-rt/lib/builtins/modoi3.c
 create mode 100644 compiler-rt/lib/builtins/muloi5.c
 create mode 100644 compiler-rt/lib/builtins/multi5.c
 create mode 100644 compiler-rt/lib/builtins/mulvoi3.c
 create mode 100644 compiler-rt/lib/builtins/negoi2.c
 create mode 100644 compiler-rt/lib/builtins/negvoi2.c
 create mode 100644 compiler-rt/lib/builtins/parityoi2.c
 create mode 100644 compiler-rt/lib/builtins/popcountoi2.c
 create mode 100644 compiler-rt/lib/builtins/subvoi3.c
 create mode 100644 compiler-rt/lib/builtins/ucmpoi2.c
 create mode 100644 compiler-rt/lib/builtins/udivmodoi4.c
 create mode 100644 compiler-rt/lib/builtins/udivoi3.c
 create mode 100644 compiler-rt/lib/builtins/umodoi3.c

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index f43ef4743ff97..d1b19fd375fb3 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -72,74 +72,94 @@ include_directories(../../../third-party/siphash/include)
 set(GENERIC_SOURCES
   absvdi2.c
   absvsi2.c
+  absvoi2.c
   absvti2.c
   adddf3.c
   addsf3.c
   addvdi3.c
   addvsi3.c
+  addvoi3.c
   addvti3.c
   apple_versioning.c
   ashldi3.c
+  ashloi3.c
   ashlti3.c
   ashrdi3.c
+  ashroi3.c
   ashrti3.c
   bswapdi2.c
   bswapsi2.c
   clzdi2.c
   clzsi2.c
+  clzoi2.c
   clzti2.c
   cmpdi2.c
+  cmpoi2.c
   cmpti2.c
   comparedf2.c
   comparesf2.c
   ctzdi2.c
   ctzsi2.c
+  ctzoi2.c
   ctzti2.c
   divdc3.c
   divdf3.c
   divdi3.c
   divmoddi4.c
   divmodsi4.c
+  divmodoi4.c
   divmodti4.c
   divsc3.c
   divsf3.c
   divsi3.c
+  divoi3.c
   divti3.c
   extendsfdf2.c
   extendhfsf2.c
   extendhfdf2.c
   ffsdi2.c
   ffssi2.c
+  ffsoi2.c
   ffsti2.c
   fixdfdi.c
   fixdfsi.c
+  fixdfoi.c
   fixdfti.c
   fixsfdi.c
+  fixsfoi.c
   fixsfsi.c
   fixsfti.c
   fixunsdfdi.c
+  fixunsdfoi.c
   fixunsdfsi.c
   fixunsdfti.c
   fixunssfdi.c
+  fixunssfoi.c
   fixunssfsi.c
   fixunssfti.c
   floatdidf.c
   floatdisf.c
   floatsidf.c
   floatsisf.c
+  floatoidf.c
+  floatoisf.c
   floattidf.c
   floattisf.c
   floatundidf.c
   floatundisf.c
   floatunsidf.c
   floatunsisf.c
+  floatunoidf.c
+  floatunoisf.c
   floatuntidf.c
   floatuntisf.c
   fp_mode.c
   int_util.c
   lshrdi3.c
+  lshroi3.c
   lshrti3.c
   moddi3.c
+  modoi3.c
   modsi3.c
   modti3.c
   muldc3.c
@@ -147,25 +167,32 @@ set(GENERIC_SOURCES
   muldi3.c
   mulodi4.c
   mulosi4.c
+  muloi5.c
   muloti4.c
   mulsc3.c
   mulsf3.c
   multi3.c
+  multi5.c
   mulvdi3.c
+  mulvoi3.c
   mulvsi3.c
   mulvti3.c
   negdf2.c
   negdi2.c
   negsf2.c
+  negoi2.c
   negti2.c
   negvdi2.c
+  negvoi2.c
   negvsi2.c
   negvti2.c
   os_version_check.c
   paritydi2.c
   paritysi2.c
+  parityoi2.c
   parityti2.c
   popcountdi2.c
+  popcountoi2.c
   popcountsi2.c
   popcountti2.c
   powidf2.c
@@ -174,20 +201,25 @@ set(GENERIC_SOURCES
   subsf3.c
   subvdi3.c
   subvsi3.c
+  subvoi3.c
   subvti3.c
   trampoline_setup.c
   truncdfhf2.c
   truncdfsf2.c
   truncsfhf2.c
   ucmpdi2.c
+  ucmpoi2.c
   ucmpti2.c
   udivdi3.c
   udivmoddi4.c
+  udivmodoi4.c
   udivmodsi4.c
   udivmodti4.c
+  udivoi3.c
   udivsi3.c
   udivti3.c
   umoddi3.c
+  umodoi3.c
   umodsi3.c
   umodti3.c
 )
@@ -211,14 +243,18 @@ set(GENERIC_TF_SOURCES
   extendsftf2.c
   fixtfdi.c
   fixtfsi.c
+  fixtfoi.c
   fixtfti.c
   fixunstfdi.c
+  fixunstfoi.c
   fixunstfsi.c
   fixunstfti.c
   floatditf.c
+  floatoitf.c
   floatsitf.c
   floattitf.c
   floatunditf.c
+  floatunoitf.c
   floatunsitf.c
   floatuntitf.c
   multc3.c
@@ -306,13 +342,17 @@ set(x86_80_BIT_SOURCES
   extendhfxf2.c
   extendxftf2.c
   fixxfdi.c
+  fixxfoi.c
   fixxfti.c
   fixunsxfdi.c
+  fixunsxfoi.c
   fixunsxfsi.c
   fixunsxfti.c
   floatdixf.c
+  floatoixf.c
   floattixf.c
   floatundixf.c
+  floatunoixf.c
   floatuntixf.c
   mulxc3.c
   powixf2.c
diff --git a/compiler-rt/lib/builtins/absvoi2.c b/compiler-rt/lib/builtins/absvoi2.c
new file mode 100644
index 0000000000000..1cc6c8a47731b
--- /dev/null
+++ b/compiler-rt/lib/builtins/absvoi2.c
@@ -0,0 +1,29 @@
+//===-- absvoi2.c - Implement __absvoi2 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __absvoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: absolute value
+
+// Effects: aborts if abs(x) < 0
+
+COMPILER_RT_ABI oi_int __absvoi2(oi_int a) {
+  const int N = (int)(sizeof(oi_int) * CHAR_BIT);
+  if (a == (oi_int)((ou_int)1 << (N - 1)))
+    compilerrt_abort();
+  const oi_int s = a >> (N - 1);
+  return (a ^ s) - s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/addvoi3.c b/compiler-rt/lib/builtins/addvoi3.c
new file mode 100644
index 0000000000000..8000c6b9104a5
--- /dev/null
+++ b/compiler-rt/lib/builtins/addvoi3.c
@@ -0,0 +1,33 @@
+//===-- addvoi3.c - Implement __addvoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __addvoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a + b
+
+// Effects: aborts if a + b overflows
+
+COMPILER_RT_ABI oi_int __addvoi3(oi_int a, oi_int b) {
+  oi_int s = (ou_int)a + (ou_int)b;
+  if (b >= 0) {
+    if (s < a)
+      compilerrt_abort();
+  } else {
+    if (s >= a)
+      compilerrt_abort();
+  }
+  return s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ashloi3.c b/compiler-rt/lib/builtins/ashloi3.c
new file mode 100644
index 0000000000000..9d81628403ab7
--- /dev/null
+++ b/compiler-rt/lib/builtins/ashloi3.c
@@ -0,0 +1,39 @@
+//===-- ashloi3.c - Implement __ashloi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ashloi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a << b
+
+// Precondition:  0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __ashloi3(oi_int a, int b) {
+  const int bits_in_tword = (int)(sizeof(ti_int) * CHAR_BIT);
+  owords input;
+  owords result;
+  input.all = a;
+  if (b & bits_in_tword) /* bits_in_tword <= b < bits_in_oword */ {
+    result.s.low = 0;
+    result.s.high = input.s.low << (b - bits_in_tword);
+  } else /* 0 <= b < bits_in_tword */ {
+    if (b == 0)
+      return a;
+    result.s.low = input.s.low << b;
+    result.s.high =
+        ((tu_int)input.s.high << b) | (input.s.low >> (bits_in_tword - b));
+  }
+  return result.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ashroi3.c b/compiler-rt/lib/builtins/ashroi3.c
new file mode 100644
index 0000000000000..35b583d47f7cb
--- /dev/null
+++ b/compiler-rt/lib/builtins/ashroi3.c
@@ -0,0 +1,40 @@
+//===-- ashroi3.c - Implement __ashroi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ashroi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: arithmetic a >> b
+
+// Precondition:  0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __ashroi3(oi_int a, int b) {
+  const int bits_in_tword = (int)(sizeof(ti_int) * CHAR_BIT);
+  owords input;
+  owords result;
+  input.all = a;
+  if (b & bits_in_tword) /* bits_in_tword <= b < bits_in_oword */ {
+    // result.s.high = input.s.high < 0 ? -1 : 0
+    result.s.high = input.s.high >> (bits_in_tword - 1);
+    result.s.low = input.s.high >> (b - bits_in_tword);
+  } else /* 0 <= b < bits_in_tword */ {
+    if (b == 0)
+      return a;
+    result.s.high = input.s.high >> b;
+    result.s.low =
+        ((tu_int)input.s.high << (bits_in_tword - b)) | (input.s.low >> b);
+  }
+  return result.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/clzoi2.c b/compiler-rt/lib/builtins/clzoi2.c
new file mode 100644
index 0000000000000..11a61d1034129
--- /dev/null
+++ b/compiler-rt/lib/builtins/clzoi2.c
@@ -0,0 +1,29 @@
+//===-- clzoi2.c - Implement __clzoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __clzoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: the number of leading 0-bits
+
+// Precondition: a != 0
+
+COMPILER_RT_ABI int __clzoi2(oi_int a) {
+  owords x;
+  x.all = a;
+  const ti_int f = -(x.s.high == 0);
+  return __clzti2((x.s.high & ~f) | (x.s.low & f)) +
+         ((si_int)f & ((si_int)(sizeof(ti_int) * CHAR_BIT)));
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/cmpoi2.c b/compiler-rt/lib/builtins/cmpoi2.c
new file mode 100644
index 0000000000000..ba16733eda25c
--- /dev/null
+++ b/compiler-rt/lib/builtins/cmpoi2.c
@@ -0,0 +1,37 @@
+//===-- cmpoi2.c - Implement __cmpoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __cmpoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns:  if (a <  b) returns 0
+//           if (a == b) returns 1
+//           if (a >  b) returns 2
+
+COMPILER_RT_ABI si_int __cmpoi2(oi_int a, oi_int b) {
+  owords x;
+  x.all = a;
+  owords y;
+  y.all = b;
+  if (x.s.high < y.s.high)
+    return 0;
+  if (x.s.high > y.s.high)
+    return 2;
+  if (x.s.low < y.s.low)
+    return 0;
+  if (x.s.low > y.s.low)
+    return 2;
+  return 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ctzoi2.c b/compiler-rt/lib/builtins/ctzoi2.c
new file mode 100644
index 0000000000000..b477aa01b31a8
--- /dev/null
+++ b/compiler-rt/lib/builtins/ctzoi2.c
@@ -0,0 +1,29 @@
+//===-- ctzoi2.c - Implement __ctzoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ctzoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: the number of trailing 0-bits
+
+// Precondition: a != 0
+
+COMPILER_RT_ABI int __ctzoi2(oi_int a) {
+  owords x;
+  x.all = a;
+  if (x.s.low != 0)
+    return __ctzti2(x.s.low);
+  return __ctzti2(x.s.high) + (int)(sizeof(ti_int) * CHAR_BIT);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/divmodoi4.c b/compiler-rt/lib/builtins/divmodoi4.c
new file mode 100644
index 0000000000000..450dcaecf0720
--- /dev/null
+++ b/compiler-rt/lib/builtins/divmodoi4.c
@@ -0,0 +1,32 @@
+//===-- divmodoi4.c - Implement __divmodoi4 -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __divmodoi4 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a / b, *rem = a % b
+
+COMPILER_RT_ABI oi_int __divmodoi4(oi_int a, oi_int b, oi_int *rem) {
+  const int bits_in_oword_m1 = (int)(sizeof(oi_int) * CHAR_BIT) - 1;
+  oi_int s_a = a >> bits_in_oword_m1; // s_a = a < 0 ? -1 : 0
+  oi_int s_b = b >> bits_in_oword_m1; // s_b = b < 0 ? -1 : 0
+  a = (ou_int)(a ^ s_a) - s_a;        // negate if s_a == -1
+  b = (ou_int)(b ^ s_b) - s_b;        // negate if s_b == -1
+  s_b ^= s_a;                         // sign of quotient
+  ou_int r;
+  oi_int q = (__udivmodoi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1
+  *rem = (r ^ s_a) - s_a;                          // negate if s_a == -1
+  return q;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/divoi3.c b/compiler-rt/lib/builtins/divoi3.c
new file mode 100644
index 0000000000000..2132b578cdecd
--- /dev/null
+++ b/compiler-rt/lib/builtins/divoi3.c
@@ -0,0 +1,26 @@
+//===-- divoi3.c - Implement __divoi3 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __divoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a / b
+
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#define COMPUTE_UDIV(a, b) __udivmodoi4((a), (b), (ou_int *)0)
+#include "int_div_impl.inc"
+
+COMPILER_RT_ABI oi_int __divoi3(oi_int a, oi_int b) { return __divXi3(a, b); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ffsoi2.c b/compiler-rt/lib/builtins/ffsoi2.c
new file mode 100644
index 0000000000000..2378ee305ec6a
--- /dev/null
+++ b/compiler-rt/lib/builtins/ffsoi2.c
@@ -0,0 +1,31 @@
+//===-- ffsoi2.c - Implement __ffsoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ffsoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: the index of the least significant 1-bit in a, or
+// the value zero if a is zero. The least significant bit is index one.
+
+COMPILER_RT_ABI int __ffsoi2(oi_int a) {
+  owords x;
+  x.all = a;
+  if (x.s.low == 0) {
+    if (x.s.high == 0)
+      return 0;
+    return __ctzti2(x.s.high) + (1 + sizeof(ti_int) * CHAR_BIT);
+  }
+  return __ctzti2(x.s.low) + 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixdfoi.c b/compiler-rt/lib/builtins/fixdfoi.c
new file mode 100644
index 0000000000000..cfe45b63cb520
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixdfoi.c
@@ -0,0 +1,21 @@
+//===-- fixdfoi.c - Implement __fixdfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+#define DOUBLE_PRECISION
+#include "fp_lib.h"
+
+typedef oi_int fixint_t;
+typedef ou_int fixuint_t;
+#include "fp_fixint_impl.inc"
+
+COMPILER_RT_ABI oi_int __fixdfoi(fp_t a) { return __fixint(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixsfoi.c b/compiler-rt/lib/builtins/fixsfoi.c
new file mode 100644
index 0000000000000..2c67dee2bb206
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixsfoi.c
@@ -0,0 +1,21 @@
+//===-- fixsfoi.c - Implement __fixsfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+#define SINGLE_PRECISION
+#include "fp_lib.h"
+
+typedef oi_int fixint_t;
+typedef ou_int fixuint_t;
+#include "fp_fixint_impl.inc"
+
+COMPILER_RT_ABI oi_int __fixsfoi(fp_t a) { return __fixint(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixtfoi.c b/compiler-rt/lib/builtins/fixtfoi.c
new file mode 100644
index 0000000000000..7edab043c4c40
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixtfoi.c
@@ -0,0 +1,18 @@
+//===-- fixtfoi.c - Implement __fixtfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+typedef oi_int fixint_t;
+typedef ou_int fixuint_t;
+#include "fp_fixint_impl.inc"
+
+COMPILER_RT_ABI oi_int __fixtfoi(fp_t a) { return __fixint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunsdfoi.c b/compiler-rt/lib/builtins/fixunsdfoi.c
new file mode 100644
index 0000000000000..86066cd83e674
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunsdfoi.c
@@ -0,0 +1,17 @@
+//===-- fixunsdfoi.c - Implement __fixunsdfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define DOUBLE_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_256BIT)
+typedef ou_int fixuint_t;
+#include "fp_fixuint_impl.inc"
+
+COMPILER_RT_ABI ou_int __fixunsdfoi(fp_t a) { return __fixuint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunssfoi.c b/compiler-rt/lib/builtins/fixunssfoi.c
new file mode 100644
index 0000000000000..069dc584ea18b
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunssfoi.c
@@ -0,0 +1,17 @@
+//===-- fixunssfoi.c - Implement __fixunssfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SINGLE_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_256BIT)
+typedef ou_int fixuint_t;
+#include "fp_fixuint_impl.inc"
+
+COMPILER_RT_ABI ou_int __fixunssfoi(fp_t a) { return __fixuint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunstfoi.c b/compiler-rt/lib/builtins/fixunstfoi.c
new file mode 100644
index 0000000000000..00c9aff080a70
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunstfoi.c
@@ -0,0 +1,17 @@
+//===-- fixunstfoi.c - Implement __fixunstfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+typedef ou_int fixuint_t;
+#include "fp_fixuint_impl.inc"
+
+COMPILER_RT_ABI ou_int __fixunstfoi(fp_t a) { return __fixuint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunsxfoi.c b/compiler-rt/lib/builtins/fixunsxfoi.c
new file mode 100644
index 0000000000000..2297f9d3dc335
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunsxfoi.c
@@ -0,0 +1,44 @@
+//===-- fixunsxfoi.c - Implement __fixunsxfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __fixunsxfoi for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to an unsigned 256-bit integer, rounding toward zero.
+//          Negative values all become zero.
+
+// Assumption: long double is an intel 80 bit floating point type padded with 6
+// bytes ou_int is a 256 bit integral type value in long double is representable
+// in ou_int or is negative
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI ou_int __fixunsxfoi(xf_float a) {
+  xf_bits fb;
+  fb.f = a;
+  int e = (fb.u.high.s.low & 0x00007FFF) - 16383;
+  if (e < 0 || (fb.u.high.s.low & 0x00008000))
+    return 0;
+  if ((unsigned)e > sizeof(ou_int) * CHAR_BIT)
+    return ~(ou_int)0;
+  ou_int r = fb.u.low.all;
+  if (e > 63)
+    r <<= (e - 63);
+  else
+    r >>= (63 - e);
+  return r;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixxfoi.c b/compiler-rt/lib/builtins/fixxfoi.c
new file mode 100644
index 0000000000000..3a1003728be92
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixxfoi.c
@@ -0,0 +1,46 @@
+//===-- fixxfoi.c - Implement __fixxfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __fixxfoi for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to a signed 256-bit integer, rounding toward zero.
+
+// Assumption: long double is an Intel 80-bit floating point type padded with 6
+// bytes; oi_int is a 256-bit integral type; the value in long double is
+// representable in oi_int.
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI oi_int __fixxfoi(xf_float a) {
+  const oi_int oi_max = (oi_int)((~(ou_int)0) / 2);
+  const oi_int oi_min = -oi_max - 1;
+  xf_bits fb;
+  fb.f = a;
+  int e = (fb.u.high.s.low & 0x00007FFF) - 16383;
+  if (e < 0)
+    return 0;
+  oi_int s = -(si_int)((fb.u.high.s.low & 0x00008000) >> 15);
+  oi_int r = fb.u.low.all;
+  if ((unsigned)e >= sizeof(oi_int) * CHAR_BIT)
+    return a > 0 ? oi_max : oi_min;
+  if (e > 63)
+    r <<= (e - 63);
+  else
+    r >>= (63 - e);
+  return (r ^ s) - s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatoidf.c b/compiler-rt/lib/builtins/floatoidf.c
new file mode 100644
index 0000000000000..89cc399b061fe
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoidf.c
@@ -0,0 +1,23 @@
+//===-- floatoidf.c - Implement __floatoidf -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatoidf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_I256
+#define DST_DOUBLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI double __floatoidf(oi_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatoisf.c b/compiler-rt/lib/builtins/floatoisf.c
new file mode 100644
index 0000000000000..3efaa1ebbeec4
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoisf.c
@@ -0,0 +1,23 @@
+//===-- floatoisf.c - Implement __floatoisf -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatoisf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_I256
+#define DST_SINGLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI float __floatoisf(oi_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatoitf.c b/compiler-rt/lib/builtins/floatoitf.c
new file mode 100644
index 0000000000000..5ca149d73c966
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoitf.c
@@ -0,0 +1,26 @@
+//===-- floatoitf.c - int256 -> quad-precision conversion ---------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements oi_int to quad-precision conversion for the
+// compiler-rt library in the IEEE-754 default round-to-nearest, ties-to-even
+// mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+#define SRC_I256
+#define DST_QUAD
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI fp_t __floatoitf(oi_int a) { return __floatXiYf__(a); }
+
+#endif
diff --git a/compiler-rt/lib/builtins/floatoixf.c b/compiler-rt/lib/builtins/floatoixf.c
new file mode 100644
index 0000000000000..253a89847c401
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoixf.c
@@ -0,0 +1,73 @@
+//===-- floatoixf.c - Implement __floatoixf -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatoixf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to a long double, rounding toward even.
+
+// Assumption: long double is an IEEE 80-bit floating point type padded to 128
+// bits; oi_int is a 256-bit integral type.
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI xf_float __floatoixf(oi_int a) {
+  if (a == 0)
+    return 0.0;
+  const unsigned N = sizeof(oi_int) * CHAR_BIT;
+  const oi_int s = a >> (N - 1);
+  a = (a ^ s) - s;
+  int sd = N - __clzoi2(a); // number of significant digits
+  int e = sd - 1;           // exponent
+  if (sd > LDBL_MANT_DIG) {
+    //  start:  0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
+    //  finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
+    //                                                12345678901234567890123456
+    //  1 = msb 1 bit
+    //  P = bit LDBL_MANT_DIG-1 bits to the right of 1
+    //  Q = bit LDBL_MANT_DIG bits to the right of 1
+    //  R = "or" of all bits to the right of Q
+    switch (sd) {
+    case LDBL_MANT_DIG + 1:
+      a <<= 1;
+      break;
+    case LDBL_MANT_DIG + 2:
+      break;
+    default:
+      a = ((ou_int)a >> (sd - (LDBL_MANT_DIG + 2))) |
+          ((a & ((ou_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0);
+    };
+    // finish:
+    a |= (a & 4) != 0; // Or P into R
+    ++a;               // round - this step may add a significant bit
+    a >>= 2;           // dump Q and R
+    // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits
+    if (a & ((ou_int)1 << LDBL_MANT_DIG)) {
+      a >>= 1;
+      ++e;
+    }
+    // a is now rounded to LDBL_MANT_DIG bits
+  } else {
+    a <<= (LDBL_MANT_DIG - sd);
+    // a is now rounded to LDBL_MANT_DIG bits
+  }
+  xf_bits fb;
+  fb.u.high.s.low = ((su_int)s & 0x8000) | // sign
+                    (e + 16383);           // exponent
+  fb.u.low.all = (du_int)a;                // mantissa
+  return fb.f;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatunoidf.c b/compiler-rt/lib/builtins/floatunoidf.c
new file mode 100644
index 0000000000000..22eb74d08bfd8
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoidf.c
@@ -0,0 +1,23 @@
+//===-- floatunoidf.c - Implement __floatunoidf ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatunoidf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_U256
+#define DST_DOUBLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI double __floatunoidf(ou_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatunoisf.c b/compiler-rt/lib/builtins/floatunoisf.c
new file mode 100644
index 0000000000000..e0d13f4baee2d
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoisf.c
@@ -0,0 +1,23 @@
+//===-- floatunoisf.c - Implement __floatunoisf ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatunoisf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_U256
+#define DST_SINGLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI float __floatunoisf(ou_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatunoitf.c b/compiler-rt/lib/builtins/floatunoitf.c
new file mode 100644
index 0000000000000..d4a8de96b517e
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoitf.c
@@ -0,0 +1,26 @@
+//===-- floatunoitf.c - uint256 -> quad-precision conversion ------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ou_int to quad-precision conversion for the
+// compiler-rt library in the IEEE-754 default round-to-nearest, ties-to-even
+// mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+#define SRC_U256
+#define DST_QUAD
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI fp_t __floatunoitf(ou_int a) { return __floatXiYf__(a); }
+
+#endif
diff --git a/compiler-rt/lib/builtins/floatunoixf.c b/compiler-rt/lib/builtins/floatunoixf.c
new file mode 100644
index 0000000000000..49b15ca7e242c
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoixf.c
@@ -0,0 +1,70 @@
+//===-- floatunoixf.c - Implement __floatunoixf ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatunoixf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to a long double, rounding toward even.
+
+// Assumption: long double is an IEEE 80-bit floating point type padded to 128
+// bits; ou_int is a 256-bit integral type.
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI xf_float __floatunoixf(ou_int a) {
+  if (a == 0)
+    return 0.0;
+  const unsigned N = sizeof(ou_int) * CHAR_BIT;
+  int sd = N - __clzoi2(a); // number of significant digits
+  int e = sd - 1;           // exponent
+  if (sd > LDBL_MANT_DIG) {
+    //  start:  0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
+    //  finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
+    //                                                12345678901234567890123456
+    //  1 = msb 1 bit
+    //  P = bit LDBL_MANT_DIG-1 bits to the right of 1
+    //  Q = bit LDBL_MANT_DIG bits to the right of 1
+    //  R = "or" of all bits to the right of Q
+    switch (sd) {
+    case LDBL_MANT_DIG + 1:
+      a <<= 1;
+      break;
+    case LDBL_MANT_DIG + 2:
+      break;
+    default:
+      a = (a >> (sd - (LDBL_MANT_DIG + 2))) |
+          ((a & ((ou_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0);
+    };
+    // finish:
+    a |= (a & 4) != 0; // Or P into R
+    ++a;               // round - this step may add a significant bit
+    a >>= 2;           // dump Q and R
+    // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits
+    if (a & ((ou_int)1 << LDBL_MANT_DIG)) {
+      a >>= 1;
+      ++e;
+    }
+    // a is now rounded to LDBL_MANT_DIG bits
+  } else {
+    a <<= (LDBL_MANT_DIG - sd);
+    // a is now rounded to LDBL_MANT_DIG bits
+  }
+  xf_bits fb;
+  fb.u.high.s.low = (e + 16383); // exponent
+  fb.u.low.all = (du_int)a;      // mantissa
+  return fb.f;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fp_fixint_impl.inc b/compiler-rt/lib/builtins/fp_fixint_impl.inc
index 2f2f77ce781ae..245b29b7ba7ab 100644
--- a/compiler-rt/lib/builtins/fp_fixint_impl.inc
+++ b/compiler-rt/lib/builtins/fp_fixint_impl.inc
@@ -27,8 +27,8 @@ static __inline fixint_t __fixint(fp_t a) {
   if (exponent < 0)
     return 0;
 
-  // If the value is too large for the integer type, saturate.
-  if ((unsigned)exponent >= sizeof(fixint_t) * CHAR_BIT)
+  // If the value is too large for the integer type, or is inf/NaN, saturate.
+  if ((unsigned)exponent >= sizeof(fixint_t) * CHAR_BIT || aAbs >= infRep)
     return sign == 1 ? fixint_max : fixint_min;
 
   // If 0 <= exponent < significandBits, right shift to get the result.
diff --git a/compiler-rt/lib/builtins/int_lib.h b/compiler-rt/lib/builtins/int_lib.h
index 943430de259d8..79271308a448b 100644
--- a/compiler-rt/lib/builtins/int_lib.h
+++ b/compiler-rt/lib/builtins/int_lib.h
@@ -112,9 +112,15 @@ COMPILER_RT_ABI su_int __udivmodsi4(su_int a, su_int b, su_int *rem);
 COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem);
 #ifdef CRT_HAS_128BIT
 COMPILER_RT_ABI int __clzti2(ti_int a);
+COMPILER_RT_ABI int __ctzti2(ti_int a);
 COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem);
 #endif
 
+#ifdef CRT_HAS_256BIT
+COMPILER_RT_ABI int __clzoi2(oi_int a);
+COMPILER_RT_ABI ou_int __udivmodoi4(ou_int a, ou_int b, ou_int *rem);
+#endif
+
 // Definitions for builtins unavailable on MSVC
 #if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>
diff --git a/compiler-rt/lib/builtins/int_to_fp.h b/compiler-rt/lib/builtins/int_to_fp.h
index 2c1218f1e89c4..3393b2c0f4aaa 100644
--- a/compiler-rt/lib/builtins/int_to_fp.h
+++ b/compiler-rt/lib/builtins/int_to_fp.h
@@ -36,6 +36,16 @@ typedef __uint128_t src_t;
 typedef __uint128_t usrc_t;
 static __inline int clzSrcT(usrc_t x) { return __clzti2(x); }
 
+#elif defined SRC_I256
+typedef __int256_t src_t;
+typedef __uint256_t usrc_t;
+static __inline int clzSrcT(usrc_t x) { return __clzoi2(x); }
+
+#elif defined SRC_U256
+typedef __uint256_t src_t;
+typedef __uint256_t usrc_t;
+static __inline int clzSrcT(usrc_t x) { return __clzoi2(x); }
+
 #else
 #error Source should be a handled integer type.
 #endif
diff --git a/compiler-rt/lib/builtins/int_to_fp_impl.inc b/compiler-rt/lib/builtins/int_to_fp_impl.inc
index 11736ed7aafc8..91eb668de9eb7 100644
--- a/compiler-rt/lib/builtins/int_to_fp_impl.inc
+++ b/compiler-rt/lib/builtins/int_to_fp_impl.inc
@@ -63,7 +63,13 @@ static __inline dst_t __floatXiYf__(src_t a) {
   const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1);
   const int dstExpBits = dstBits - dstSigBits - 1;
   const int dstExpBias = (1 << (dstExpBits - 1)) - 1;
+  const int dstExpMax = (1 << dstExpBits) - 1;
   const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1;
+  // If the exponent exceeds the destination's range, return infinity.
+  if (e + dstExpBias >= dstExpMax) {
+    return dstFromRep(((dst_rep_t)s & dstSignMask) |
+                      ((dst_rep_t)dstExpMax << dstSigBits));
+  }
   // Combine sign, exponent, and mantissa.
   const dst_rep_t result = ((dst_rep_t)s & dstSignMask) |
                            ((dst_rep_t)(e + dstExpBias) << dstSigBits) |
diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h
index 7c7f8cb64aa9a..6420ef0f65e84 100644
--- a/compiler-rt/lib/builtins/int_types.h
+++ b/compiler-rt/lib/builtins/int_types.h
@@ -121,6 +121,56 @@ static __inline tu_int make_tu(du_int h, du_int l) {
 
 #endif // CRT_HAS_128BIT
 
+#if defined(__SIZEOF_INT256__)
+#define CRT_HAS_256BIT
+#endif
+
+#ifdef CRT_HAS_256BIT
+typedef __int256_t oi_int;
+typedef __uint256_t ou_int;
+
+typedef union {
+  oi_int all;
+  struct {
+#if _YUGA_LITTLE_ENDIAN
+    tu_int low;
+    ti_int high;
+#else
+    ti_int high;
+    tu_int low;
+#endif // _YUGA_LITTLE_ENDIAN
+  } s;
+} owords;
+
+typedef union {
+  ou_int all;
+  struct {
+#if _YUGA_LITTLE_ENDIAN
+    tu_int low;
+    tu_int high;
+#else
+    tu_int high;
+    tu_int low;
+#endif // _YUGA_LITTLE_ENDIAN
+  } s;
+} uowords;
+
+static __inline oi_int make_oi(ti_int h, ti_int l) {
+  owords r;
+  r.s.high = (tu_int)h;
+  r.s.low = (tu_int)l;
+  return r.all;
+}
+
+static __inline ou_int make_ou(tu_int h, tu_int l) {
+  uowords r;
+  r.s.high = h;
+  r.s.low = l;
+  return r.all;
+}
+
+#endif // CRT_HAS_256BIT
+
 // FreeBSD's boot environment does not support using floating-point and poisons
 // the float and double keywords.
 #if defined(__FreeBSD__) && defined(_STANDALONE)
diff --git a/compiler-rt/lib/builtins/lshroi3.c b/compiler-rt/lib/builtins/lshroi3.c
new file mode 100644
index 0000000000000..d4e4920bda0a1
--- /dev/null
+++ b/compiler-rt/lib/builtins/lshroi3.c
@@ -0,0 +1,38 @@
+//===-- lshroi3.c - Implement __lshroi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __lshroi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: logical a >> b
+
+// Precondition:  0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __lshroi3(oi_int a, int b) {
+  const int bits_in_tword = (int)(sizeof(ti_int) * CHAR_BIT);
+  uowords input;
+  uowords result;
+  input.all = a;
+  if (b & bits_in_tword) /* bits_in_tword <= b < bits_in_oword */ {
+    result.s.high = 0;
+    result.s.low = input.s.high >> (b - bits_in_tword);
+  } else /* 0 <= b < bits_in_tword */ {
+    if (b == 0)
+      return a;
+    result.s.high = input.s.high >> b;
+    result.s.low = (input.s.high << (bits_in_tword - b)) | (input.s.low >> b);
+  }
+  return result.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/modoi3.c b/compiler-rt/lib/builtins/modoi3.c
new file mode 100644
index 0000000000000..117a419019a36
--- /dev/null
+++ b/compiler-rt/lib/builtins/modoi3.c
@@ -0,0 +1,26 @@
+//===-- modoi3.c - Implement __modoi3 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __modoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a % b
+
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#define ASSIGN_UMOD(res, a, b) __udivmodoi4((a), (b), &(res))
+#include "int_div_impl.inc"
+
+COMPILER_RT_ABI oi_int __modoi3(oi_int a, oi_int b) { return __modXi3(a, b); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/muloi5.c b/compiler-rt/lib/builtins/muloi5.c
new file mode 100644
index 0000000000000..60f3831aac959
--- /dev/null
+++ b/compiler-rt/lib/builtins/muloi5.c
@@ -0,0 +1,29 @@
+//===-- muloi5.c - Implement __muloi5 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __muloi5 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a * b
+
+// Effects: sets *overflow to 1 if a * b overflows
+
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#include "int_mulo_impl.inc"
+
+COMPILER_RT_ABI oi_int __muloi5(oi_int a, oi_int b, int *overflow) {
+  return __muloXi4(a, b, overflow);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/multi5.c b/compiler-rt/lib/builtins/multi5.c
new file mode 100644
index 0000000000000..9172895b7ebf7
--- /dev/null
+++ b/compiler-rt/lib/builtins/multi5.c
@@ -0,0 +1,51 @@
+//===-- multi5.c - Implement __multi5 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __multi5 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a * b
+
+static oi_int __multti3(tu_int a, tu_int b) {
+  owords r;
+  const int bits_in_tword_2 = (int)(sizeof(ti_int) * CHAR_BIT) / 2;
+  const tu_int lower_mask = (tu_int)~0 >> bits_in_tword_2;
+  r.s.low = (a & lower_mask) * (b & lower_mask);
+  tu_int t = (tu_int)r.s.low >> bits_in_tword_2;
+  r.s.low &= lower_mask;
+  t += (a >> bits_in_tword_2) * (b & lower_mask);
+  r.s.low += (t & lower_mask) << bits_in_tword_2;
+  r.s.high = t >> bits_in_tword_2;
+  t = (tu_int)r.s.low >> bits_in_tword_2;
+  r.s.low &= lower_mask;
+  t += (b >> bits_in_tword_2) * (a & lower_mask);
+  r.s.low += (t & lower_mask) << bits_in_tword_2;
+  r.s.high += t >> bits_in_tword_2;
+  r.s.high += (a >> bits_in_tword_2) * (b >> bits_in_tword_2);
+  return r.all;
+}
+
+// Returns: a * b
+
+COMPILER_RT_ABI oi_int __multi5(oi_int a, oi_int b) {
+  owords x;
+  x.all = a;
+  owords y;
+  y.all = b;
+  owords r;
+  r.all = __multti3(x.s.low, y.s.low);
+  r.s.high += x.s.high * y.s.low + x.s.low * y.s.high;
+  return r.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/mulvoi3.c b/compiler-rt/lib/builtins/mulvoi3.c
new file mode 100644
index 0000000000000..1ec46d45e7eff
--- /dev/null
+++ b/compiler-rt/lib/builtins/mulvoi3.c
@@ -0,0 +1,27 @@
+//===-- mulvoi3.c - Implement __mulvoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __mulvoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a * b
+
+// Effects: aborts if a * b overflows
+
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#include "int_mulv_impl.inc"
+
+COMPILER_RT_ABI oi_int __mulvoi3(oi_int a, oi_int b) { return __mulvXi3(a, b); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/negoi2.c b/compiler-rt/lib/builtins/negoi2.c
new file mode 100644
index 0000000000000..ae46825fd7416
--- /dev/null
+++ b/compiler-rt/lib/builtins/negoi2.c
@@ -0,0 +1,25 @@
+//===-- negoi2.c - Implement __negoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __negoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: -a
+
+COMPILER_RT_ABI oi_int __negoi2(oi_int a) {
+  // Note: this routine is here for API compatibility; any sane compiler
+  // should expand it inline.
+  return -(ou_int)a;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/negvoi2.c b/compiler-rt/lib/builtins/negvoi2.c
new file mode 100644
index 0000000000000..07d29b6480a68
--- /dev/null
+++ b/compiler-rt/lib/builtins/negvoi2.c
@@ -0,0 +1,28 @@
+//===-- negvoi2.c - Implement __negvoi2 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __negvoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: -a
+
+// Effects: aborts if -a overflows
+
+COMPILER_RT_ABI oi_int __negvoi2(oi_int a) {
+  const oi_int MIN = (ou_int)1 << ((int)(sizeof(oi_int) * CHAR_BIT) - 1);
+  if (a == MIN)
+    compilerrt_abort();
+  return -a;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/parityoi2.c b/compiler-rt/lib/builtins/parityoi2.c
new file mode 100644
index 0000000000000..88ca0791a8a98
--- /dev/null
+++ b/compiler-rt/lib/builtins/parityoi2.c
@@ -0,0 +1,36 @@
+//===-- parityoi2.c - Implement __parityoi2 -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __parityoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: 1 if number of bits is odd else returns 0
+
+COMPILER_RT_ABI int __parityoi2(oi_int a) {
+  owords x;
+  x.all = a;
+  // XOR the two 128-bit halves, then delegate to parityti2's approach.
+  tu_int x2 = x.s.high ^ x.s.low;
+  // XOR the two 64-bit halves of the 128-bit result.
+  dwords x3;
+  utwords t;
+  t.all = x2;
+  x3.all = t.s.high ^ t.s.low;
+  su_int x4 = x3.s.high ^ x3.s.low;
+  x4 ^= x4 >> 16;
+  x4 ^= x4 >> 8;
+  x4 ^= x4 >> 4;
+  return (0x6996 >> (x4 & 0xF)) & 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/popcountoi2.c b/compiler-rt/lib/builtins/popcountoi2.c
new file mode 100644
index 0000000000000..2052c3b26c04b
--- /dev/null
+++ b/compiler-rt/lib/builtins/popcountoi2.c
@@ -0,0 +1,27 @@
+//===-- popcountoi2.c - Implement __popcountoi2 ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __popcountoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __popcountti2(ti_int a);
+
+// Returns: count of 1 bits
+
+COMPILER_RT_ABI int __popcountoi2(oi_int a) {
+  uowords x;
+  x.all = (ou_int)a;
+  return __popcountti2(x.s.low) + __popcountti2(x.s.high);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/subvoi3.c b/compiler-rt/lib/builtins/subvoi3.c
new file mode 100644
index 0000000000000..b4c64c0d9dcf5
--- /dev/null
+++ b/compiler-rt/lib/builtins/subvoi3.c
@@ -0,0 +1,33 @@
+//===-- subvoi3.c - Implement __subvoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __subvoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a - b
+
+// Effects: aborts if a - b overflows
+
+COMPILER_RT_ABI oi_int __subvoi3(oi_int a, oi_int b) {
+  oi_int s = (ou_int)a - (ou_int)b;
+  if (b >= 0) {
+    if (s > a)
+      compilerrt_abort();
+  } else {
+    if (s <= a)
+      compilerrt_abort();
+  }
+  return s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ucmpoi2.c b/compiler-rt/lib/builtins/ucmpoi2.c
new file mode 100644
index 0000000000000..cbfbe23fe0836
--- /dev/null
+++ b/compiler-rt/lib/builtins/ucmpoi2.c
@@ -0,0 +1,37 @@
+//===-- ucmpoi2.c - Implement __ucmpoi2 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ucmpoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns:  if (a <  b) returns 0
+//           if (a == b) returns 1
+//           if (a >  b) returns 2
+
+COMPILER_RT_ABI si_int __ucmpoi2(ou_int a, ou_int b) {
+  uowords x;
+  x.all = a;
+  uowords y;
+  y.all = b;
+  if (x.s.high < y.s.high)
+    return 0;
+  if (x.s.high > y.s.high)
+    return 2;
+  if (x.s.low < y.s.low)
+    return 0;
+  if (x.s.low > y.s.low)
+    return 2;
+  return 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/udivmodoi4.c b/compiler-rt/lib/builtins/udivmodoi4.c
new file mode 100644
index 0000000000000..9ae441a27b745
--- /dev/null
+++ b/compiler-rt/lib/builtins/udivmodoi4.c
@@ -0,0 +1,147 @@
+//===-- udivmodoi4.c - Implement __udivmodoi4 -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __udivmodoi4 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns the 256 bit division result by 128 bit. Result must fit in 128 bits.
+// Remainder stored in r.
+// Adapted from the 128/64 algorithm in udivmodti4.c.
+UNUSED
+static inline tu_int udiv256by128to128default(tu_int u1, tu_int u0, tu_int v,
+                                              tu_int *r) {
+  const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
+  const tu_int b = (tu_int)1 << (n_utword_bits / 2); // Number base (64 bits)
+  tu_int un1, un0;                                   // Norm. dividend LSD's
+  tu_int vn1, vn0;                                   // Norm. divisor digits
+  tu_int q1, q0;                                     // Quotient digits
+  tu_int un128, un21, un10;                          // Dividend digit pairs
+  tu_int rhat;                                       // A remainder
+  si_int s; // Shift amount for normalization
+
+  s = __clzti2(v);
+  if (s > 0) {
+    // Normalize the divisor.
+    v = v << s;
+    un128 = (u1 << s) | (u0 >> (n_utword_bits - s));
+    un10 = u0 << s;
+  } else {
+    // Avoid undefined behavior of (u0 >> 128).
+    un128 = u1;
+    un10 = u0;
+  }
+
+  // Break divisor up into two 64-bit digits.
+  vn1 = v >> (n_utword_bits / 2);
+  vn0 = v & (((tu_int)1 << (n_utword_bits / 2)) - 1);
+
+  // Break right half of dividend into two digits.
+  un1 = un10 >> (n_utword_bits / 2);
+  un0 = un10 & (((tu_int)1 << (n_utword_bits / 2)) - 1);
+
+  // Compute the first quotient digit, q1.
+  q1 = un128 / vn1;
+  rhat = un128 - q1 * vn1;
+
+  // q1 has at most error 2. No more than 2 iterations.
+  while (q1 >= b || q1 * vn0 > b * rhat + un1) {
+    q1 = q1 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b)
+      break;
+  }
+
+  un21 = un128 * b + un1 - q1 * v;
+
+  // Compute the second quotient digit.
+  q0 = un21 / vn1;
+  rhat = un21 - q0 * vn1;
+
+  // q0 has at most error 2. No more than 2 iterations.
+  while (q0 >= b || q0 * vn0 > b * rhat + un0) {
+    q0 = q0 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b)
+      break;
+  }
+
+  *r = (un21 * b + un0 - q0 * v) >> s;
+  return q1 * b + q0;
+}
+
+static inline tu_int udiv256by128to128(tu_int u1, tu_int u0, tu_int v,
+                                       tu_int *r) {
+  return udiv256by128to128default(u1, u0, v, r);
+}
+
+// Effects: if rem != 0, *rem = a % b
+// Returns: a / b
+
+COMPILER_RT_ABI ou_int __udivmodoi4(ou_int a, ou_int b, ou_int *rem) {
+  const unsigned n_uoword_bits = sizeof(ou_int) * CHAR_BIT;
+  uowords dividend;
+  dividend.all = a;
+  uowords divisor;
+  divisor.all = b;
+  uowords quotient;
+  uowords remainder;
+  if (divisor.all > dividend.all) {
+    if (rem)
+      *rem = dividend.all;
+    return 0;
+  }
+  // When the divisor fits in 128 bits, we can use an optimized path.
+  if (divisor.s.high == 0) {
+    remainder.s.high = 0;
+    if (dividend.s.high < divisor.s.low) {
+      // The result fits in 128 bits.
+      quotient.s.low = udiv256by128to128(dividend.s.high, dividend.s.low,
+                                         divisor.s.low, &remainder.s.low);
+      quotient.s.high = 0;
+    } else {
+      // First, divide with the high part to get the remainder in
+      // dividend.s.high. After that dividend.s.high < divisor.s.low.
+      quotient.s.high = dividend.s.high / divisor.s.low;
+      dividend.s.high = dividend.s.high % divisor.s.low;
+      quotient.s.low = udiv256by128to128(dividend.s.high, dividend.s.low,
+                                         divisor.s.low, &remainder.s.low);
+    }
+    if (rem)
+      *rem = remainder.all;
+    return quotient.all;
+  }
+  // 0 <= shift <= 127.
+  si_int shift = __clzti2(divisor.s.high) - __clzti2(dividend.s.high);
+  divisor.all <<= shift;
+  quotient.s.high = 0;
+  quotient.s.low = 0;
+  for (; shift >= 0; --shift) {
+    quotient.s.low <<= 1;
+    // Branch free version of.
+    // if (dividend.all >= divisor.all)
+    // {
+    //    dividend.all -= divisor.all;
+    //    carry = 1;
+    // }
+    const oi_int s =
+        (oi_int)(divisor.all - dividend.all - 1) >> (n_uoword_bits - 1);
+    quotient.s.low |= s & 1;
+    dividend.all -= divisor.all & s;
+    divisor.all >>= 1;
+  }
+  if (rem)
+    *rem = dividend.all;
+  return quotient.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/udivoi3.c b/compiler-rt/lib/builtins/udivoi3.c
new file mode 100644
index 0000000000000..a4f489c9c7f77
--- /dev/null
+++ b/compiler-rt/lib/builtins/udivoi3.c
@@ -0,0 +1,23 @@
+//===-- udivoi3.c - Implement __udivoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __udivoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a / b
+
+COMPILER_RT_ABI ou_int __udivoi3(ou_int a, ou_int b) {
+  return __udivmodoi4(a, b, 0);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/umodoi3.c b/compiler-rt/lib/builtins/umodoi3.c
new file mode 100644
index 0000000000000..3598777e1a78b
--- /dev/null
+++ b/compiler-rt/lib/builtins/umodoi3.c
@@ -0,0 +1,25 @@
+//===-- umodoi3.c - Implement __umodoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __umodoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a % b
+
+COMPILER_RT_ABI ou_int __umodoi3(ou_int a, ou_int b) {
+  ou_int r;
+  __udivmodoi4(a, b, &r);
+  return r;
+}
+
+#endif // CRT_HAS_256BIT

>From d441ec5cf91245b60fecc953890c39b9f15c2410 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:03 +0100
Subject: [PATCH 08/17] [compiler-rt][test] Add __int256 builtin tests

Add 40 unit tests for all __int256 builtins, covering:
- Full-width big-number arithmetic (multiply, div/mod, divmod)
- Shifts (logical/arithmetic left/right) with various shift amounts
- Bit operations (clz, ctz, ffs, popcount, parity)
- Overflow-checked operations (add, sub, mul, abs, neg)
- Float conversions (fix/fixuns/float/floatun for sf/df/tf/xf)
- Signed/unsigned comparisons

Tests use CRT_HAS_256BIT guard: compile on 32-bit but print "skipped".
Each test covers boundary cases (0, 1, max, min, powers of 2).
Update lit.cfg.py to provide has_int256 feature flag.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 compiler-rt/test/builtins/Unit/absvoi2_test.c |  67 +++++
 compiler-rt/test/builtins/Unit/addvoi3_test.c |  78 +++++
 compiler-rt/test/builtins/Unit/ashloi3_test.c | 122 ++++++++
 compiler-rt/test/builtins/Unit/ashroi3_test.c |  86 ++++++
 compiler-rt/test/builtins/Unit/clzoi2_test.c  |  78 +++++
 compiler-rt/test/builtins/Unit/cmpoi2_test.c  |  93 ++++++
 compiler-rt/test/builtins/Unit/ctzoi2_test.c  |  83 ++++++
 .../test/builtins/Unit/divmodoi4_test.c       |  97 +++++++
 compiler-rt/test/builtins/Unit/divoi3_test.c  |  97 +++++++
 compiler-rt/test/builtins/Unit/ffsoi2_test.c  |  86 ++++++
 compiler-rt/test/builtins/Unit/fixdfoi_test.c |  93 ++++++
 compiler-rt/test/builtins/Unit/fixsfoi_test.c |  98 +++++++
 compiler-rt/test/builtins/Unit/fixtfoi_test.c |  47 +++
 .../test/builtins/Unit/fixunsdfoi_test.c      |  47 +++
 .../test/builtins/Unit/fixunssfoi_test.c      |  47 +++
 .../test/builtins/Unit/fixunstfoi_test.c      |  43 +++
 .../test/builtins/Unit/fixunsxfoi_test.c      | 149 ++++++++++
 compiler-rt/test/builtins/Unit/fixxfoi_test.c | 144 ++++++++++
 .../test/builtins/Unit/floatoidf_test.c       |  89 ++++++
 .../test/builtins/Unit/floatoisf_test.c       |  77 +++++
 .../test/builtins/Unit/floatoitf_test.c       |  45 +++
 .../test/builtins/Unit/floatoixf_test.c       | 114 ++++++++
 .../test/builtins/Unit/floatunoidf_test.c     |  43 +++
 .../test/builtins/Unit/floatunoisf_test.c     |  41 +++
 .../test/builtins/Unit/floatunoitf_test.c     |  43 +++
 .../test/builtins/Unit/floatunoixf_test.c     | 123 ++++++++
 compiler-rt/test/builtins/Unit/lit.cfg.py     |  11 +
 compiler-rt/test/builtins/Unit/lshroi3_test.c | 101 +++++++
 compiler-rt/test/builtins/Unit/modoi3_test.c  |  82 ++++++
 compiler-rt/test/builtins/Unit/muloi5_test.c  | 164 +++++++++++
 compiler-rt/test/builtins/Unit/multi5_test.c  | 174 +++++++++++
 compiler-rt/test/builtins/Unit/mulvoi3_test.c | 119 ++++++++
 compiler-rt/test/builtins/Unit/negoi2_test.c  |  69 +++++
 compiler-rt/test/builtins/Unit/negvoi2_test.c |  59 ++++
 .../test/builtins/Unit/parityoi2_test.c       |  83 ++++++
 .../test/builtins/Unit/popcountoi2_test.c     |  86 ++++++
 compiler-rt/test/builtins/Unit/subvoi3_test.c |  81 ++++++
 compiler-rt/test/builtins/Unit/ucmpoi2_test.c |  89 ++++++
 .../test/builtins/Unit/udivmodoi4_test.c      | 272 ++++++++++++++++++
 compiler-rt/test/builtins/Unit/udivoi3_test.c |  92 ++++++
 compiler-rt/test/builtins/Unit/umodoi3_test.c |  80 ++++++
 41 files changed, 3692 insertions(+)
 create mode 100644 compiler-rt/test/builtins/Unit/absvoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/addvoi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/ashloi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/ashroi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/clzoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/cmpoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/ctzoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/divmodoi4_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/divoi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/ffsoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixdfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixsfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixtfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixunsdfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixunssfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixunstfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixunsxfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/fixxfoi_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatoidf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatoisf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatoitf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatoixf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatunoidf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatunoisf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatunoitf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/floatunoixf_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/lshroi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/modoi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/muloi5_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/multi5_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/mulvoi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/negoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/negvoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/parityoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/popcountoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/subvoi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/ucmpoi2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/udivmodoi4_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/udivoi3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/umodoi3_test.c

diff --git a/compiler-rt/test/builtins/Unit/absvoi2_test.c b/compiler-rt/test/builtins/Unit/absvoi2_test.c
new file mode 100644
index 0000000000000..f26526f0054f9
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/absvoi2_test.c
@@ -0,0 +1,67 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_absvoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __absvoi2(oi_int a);
+
+int test__absvoi2(oi_int a, oi_int expected) {
+  oi_int x = __absvoi2(a);
+  if (x != expected) {
+    printf("error in __absvoi2\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__absvoi2((oi_int)0, (oi_int)0))
+    return 1;
+  if (test__absvoi2((oi_int)1, (oi_int)1))
+    return 1;
+  if (test__absvoi2((oi_int)-1, (oi_int)1))
+    return 1;
+  if (test__absvoi2((oi_int)42, (oi_int)42))
+    return 1;
+  if (test__absvoi2((oi_int)-42, (oi_int)42))
+    return 1;
+  // Large positive value (already positive, no change)
+  {
+    oi_int big = make_oi(make_ti(0, 1), make_ti(0, 0));
+    if (test__absvoi2(big, big))
+      return 1;
+  }
+  // Large negative value
+  if (test__absvoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)),
+                    make_oi(make_ti(0, 1), make_ti(0, 0))))
+    return 1;
+  // MAX (already positive)
+  {
+    oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+    if (test__absvoi2(MAX, MAX))
+      return 1;
+  }
+  // Note: MIN would abort, so we don't test it.
+  // Full-width big-number test (all 4 limbs populated).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  // C is negative signed; abs(C) = |C|
+  if (test__absvoi2(
+          make_oi(make_ti(0xDDDDEEEEFFFF0000LL, 0x1111222233334444ULL),
+                  make_ti(0x5555666677778888ULL, 0x9999AAAABBBBCCCCULL)),
+          make_oi(make_ti(0x222211110000FFFFLL, 0xEEEEDDDDCCCCBBBBULL),
+                  make_ti(0xAAAA999988887777ULL, 0x6666555544443334ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/addvoi3_test.c b/compiler-rt/test/builtins/Unit/addvoi3_test.c
new file mode 100644
index 0000000000000..6cc2732cf63bd
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/addvoi3_test.c
@@ -0,0 +1,78 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_addvoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __addvoi3(oi_int a, oi_int b);
+
+int test__addvoi3(oi_int a, oi_int b, oi_int expected) {
+  oi_int x = __addvoi3(a, b);
+  if (x != expected) {
+    printf("error in __addvoi3\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__addvoi3((oi_int)0, (oi_int)0, (oi_int)0))
+    return 1;
+  if (test__addvoi3((oi_int)1, (oi_int)1, (oi_int)2))
+    return 1;
+  if (test__addvoi3((oi_int)-1, (oi_int)1, (oi_int)0))
+    return 1;
+  if (test__addvoi3((oi_int)100, (oi_int)200, (oi_int)300))
+    return 1;
+  // Large values in low half (carry across 64-bit boundary)
+  if (test__addvoi3(make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFULL)),
+                    make_oi(make_ti(0, 0), make_ti(0, 1)),
+                    make_oi(make_ti(0, 0), make_ti(1, 0))))
+    return 1;
+  // Carry across 128-bit boundary (low half to high half)
+  if (test__addvoi3(make_oi(make_ti(0, 0), make_ti(-1, -1)),
+                    make_oi(make_ti(0, 0), make_ti(0, 1)),
+                    make_oi(make_ti(0, 1), make_ti(0, 0))))
+    return 1;
+  // Negative + negative
+  if (test__addvoi3((oi_int)-100, (oi_int)-200, (oi_int)-300))
+    return 1;
+  // Large positive values
+  if (test__addvoi3(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                    make_oi(make_ti(0, 2), make_ti(0, 0)),
+                    make_oi(make_ti(0, 3), make_ti(0, 0))))
+    return 1;
+  // Identity: x + 0
+  {
+    oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+    if (test__addvoi3(big, (oi_int)0, big))
+      return 1;
+  }
+  // Additive inverse
+  if (test__addvoi3(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                    make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)0))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  // A(signed) + B(signed) = 0xBBBBDDDE...99981111
+  if (test__addvoi3(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          make_oi(make_ti(0xBBBBDDDE00002222LL, 0x444466658888AAAAULL),
+                  make_ti(0xCCCCEEEF11113333ULL, 0x5555777799981111ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ashloi3_test.c b/compiler-rt/test/builtins/Unit/ashloi3_test.c
new file mode 100644
index 0000000000000..62f13f21e1941
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ashloi3_test.c
@@ -0,0 +1,122 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ashloi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a << b
+
+// Precondition:  0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __ashloi3(oi_int a, int b);
+
+int test__ashloi3(oi_int a, int b, oi_int expected) {
+  oi_int x = __ashloi3(a, b);
+  if (x != expected) {
+    owords xt;
+    xt.all = x;
+    owords expectedt;
+    expectedt.all = expected;
+    printf("error in __ashloi3: shift by %d\n", b);
+    printf("  got:      0x%.16llX%.16llX%.16llX%.16llX\n",
+           (unsigned long long)((tu_int)xt.s.high >> 64),
+           (unsigned long long)xt.s.high,
+           (unsigned long long)((tu_int)xt.s.low >> 64),
+           (unsigned long long)xt.s.low);
+    printf("  expected: 0x%.16llX%.16llX%.16llX%.16llX\n",
+           (unsigned long long)((tu_int)expectedt.s.high >> 64),
+           (unsigned long long)expectedt.s.high,
+           (unsigned long long)((tu_int)expectedt.s.low >> 64),
+           (unsigned long long)expectedt.s.low);
+  }
+  return x != expected;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Shift by 0 (identity)
+  if (test__ashloi3(
+          make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+                  make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL)),
+          0,
+          make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+                  make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL))))
+    return 1;
+  // Shift by 1
+  if (test__ashloi3((ou_int)1, 1, (ou_int)2))
+    return 1;
+  if (test__ashloi3((ou_int)1, 2, (ou_int)4))
+    return 1;
+  if (test__ashloi3((ou_int)1, 4, (ou_int)16))
+    return 1;
+  // Shift by 63 (within first 64-bit word)
+  if (test__ashloi3((ou_int)1, 63,
+                    make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL))))
+    return 1;
+  // Shift by 64 (crosses into second 64-bit word)
+  if (test__ashloi3((ou_int)1, 64, make_oi(make_ti(0, 0), make_ti(1, 0))))
+    return 1;
+  // Shift by 127 (top of low 128-bit half)
+  if (test__ashloi3((ou_int)1, 127,
+                    make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+    return 1;
+  // Shift by 128 (crosses into high 128-bit half)
+  if (test__ashloi3((ou_int)1, 128, make_oi(make_ti(0, 1), make_ti(0, 0))))
+    return 1;
+  // Shift by 129
+  if (test__ashloi3((ou_int)1, 129, make_oi(make_ti(0, 2), make_ti(0, 0))))
+    return 1;
+  // Shift by 191
+  if (test__ashloi3((ou_int)1, 191,
+                    make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0))))
+    return 1;
+  // Shift by 192
+  if (test__ashloi3((ou_int)1, 192, make_oi(make_ti(1, 0), make_ti(0, 0))))
+    return 1;
+  // Shift by 255 (MSB)
+  if (test__ashloi3((ou_int)1, 255,
+                    make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0))))
+    return 1;
+  // Multi-bit value shift by 64
+  if (test__ashloi3((ou_int)0xFFFFFFFFFFFFFFFFULL, 64,
+                    make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFULL, 0))))
+    return 1;
+  // Multi-bit value shift by 128
+  if (test__ashloi3((ou_int)0xFFFFFFFFFFFFFFFFULL, 128,
+                    make_oi(make_ti(0, 0xFFFFFFFFFFFFFFFFULL), make_ti(0, 0))))
+    return 1;
+  // Multi-bit value shift by 192
+  if (test__ashloi3((ou_int)0xFFFFFFFFFFFFFFFFULL, 192,
+                    make_oi(make_ti(0xFFFFFFFFFFFFFFFFULL, 0), make_ti(0, 0))))
+    return 1;
+  // Full value shift crossing half boundary
+  if (test__ashloi3(make_oi(make_ti(0, 0), make_ti(0, 0xABCDLL)), 4,
+                    make_oi(make_ti(0, 0), make_ti(0, 0xABCD0LL))))
+    return 1;
+  // Shift that spans both halves
+  if (test__ashloi3(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL,
+                                                   0x0000000000000001LL)),
+                    1,
+                    make_oi(make_ti(0, 1), make_ti(0, 0x0000000000000002LL))))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated, shift crosses 64-bit boundary).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__ashloi3(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          73,
+          make_oi(make_ti(0xDDFFFE2222444466LL, 0x668888AAAACCCCEEULL),
+                  make_ti(0xEF11113332000000ULL, 0x0000000000000000ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ashroi3_test.c b/compiler-rt/test/builtins/Unit/ashroi3_test.c
new file mode 100644
index 0000000000000..a48d3c160edfd
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ashroi3_test.c
@@ -0,0 +1,86 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ashroi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __ashroi3(oi_int a, int b);
+
+int test__ashroi3(oi_int a, int b, oi_int expected) {
+  oi_int x = __ashroi3(a, b);
+  if (x != expected) {
+    printf("error in __ashroi3: shift by %d\n", b);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Shift by 0
+  if (test__ashroi3((oi_int)1, 0, (oi_int)1))
+    return 1;
+  // Shift positive by small amounts
+  if (test__ashroi3((oi_int)2, 1, (oi_int)1))
+    return 1;
+  if (test__ashroi3((oi_int)4, 2, (oi_int)1))
+    return 1;
+  // Shift negative by 1 (sign extension)
+  if (test__ashroi3((oi_int)-2, 1, (oi_int)-1))
+    return 1;
+  // Shift -1 by any amount stays -1 (sign extension)
+  if (test__ashroi3((oi_int)-1, 1, (oi_int)-1))
+    return 1;
+  if (test__ashroi3((oi_int)-1, 64, (oi_int)-1))
+    return 1;
+  if (test__ashroi3((oi_int)-1, 128, (oi_int)-1))
+    return 1;
+  if (test__ashroi3((oi_int)-1, 255, (oi_int)-1))
+    return 1;
+  // Shift by 64 (within low half)
+  if (test__ashroi3(make_oi(make_ti(0, 0), make_ti(0xABCD000000000000LL, 0)),
+                    64,
+                    make_oi(make_ti(0, 0), make_ti(0, 0xABCD000000000000ULL))))
+    return 1;
+  // Shift by 128 (crosses half boundary, positive)
+  if (test__ashroi3(make_oi(make_ti(0, 1), make_ti(0, 0)), 128, (oi_int)1))
+    return 1;
+  // Shift by 128 (negative, sign extends)
+  if (test__ashroi3(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+                    128,
+                    make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0))))
+    return 1;
+  // Shift by 192
+  if (test__ashroi3(make_oi(make_ti(0x0000ABCD00000000LL, 0), make_ti(0, 0)),
+                    192, (oi_int)0x0000ABCD00000000LL))
+    return 1;
+  // Shift MSB-only by 255
+  if (test__ashroi3(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+                    255, (oi_int)-1))
+    return 1;
+  // Shift MAX positive by 255
+  if (test__ashroi3(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                    255, (oi_int)0))
+    return 1;
+  // Full-width big-number test (negative value, shift crosses 64-bit boundary).
+  // A is negative in signed interpretation; arithmetic shift sign-extends.
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__ashroi3(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          73,
+          make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFD5555DDDE6666EULL),
+                  make_ti(0xEEF7777FFF888891ULL, 0x111999A2222AAAB3ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/clzoi2_test.c b/compiler-rt/test/builtins/Unit/clzoi2_test.c
new file mode 100644
index 0000000000000..9b58e848b19db
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/clzoi2_test.c
@@ -0,0 +1,78 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_clzoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __clzoi2(oi_int a);
+
+int test__clzoi2(oi_int a, int expected) {
+  int x = __clzoi2(a);
+  if (x != expected) {
+    printf("error in __clzoi2: expected %d, got %d\n", expected, x);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Single bit in MSB position
+  if (test__clzoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)), 0))
+    return 1;
+  // Single bit in high half, lower position (bit 128)
+  if (test__clzoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 127))
+    return 1;
+  // Single bit at position 128 (MSB of low half)
+  if (test__clzoi2(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                   128))
+    return 1;
+  // 1
+  if (test__clzoi2((oi_int)1, 255))
+    return 1;
+  // All ones
+  if (test__clzoi2((oi_int)(ou_int)-1, 0))
+    return 1;
+  // Value in high word only
+  if (test__clzoi2(make_oi(make_ti(0, 0xFFLL), make_ti(0, 0)), 120))
+    return 1;
+  // Bit at position 64 (second 64-bit word)
+  if (test__clzoi2(make_oi(make_ti(0, 0), make_ti(1, 0)), 191))
+    return 1;
+  // Bit at position 192
+  if (test__clzoi2(make_oi(make_ti(1, 0), make_ti(0, 0)), 63))
+    return 1;
+  // 0xFF in low word only
+  if (test__clzoi2((oi_int)0xFF, 248))
+    return 1;
+  // Single bit at position 191
+  if (test__clzoi2(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+                   64))
+    return 1;
+  // Power of 2 at position 200
+  if (test__clzoi2(make_oi(make_ti(0x100LL, 0), make_ti(0, 0)), 55))
+    return 1;
+  // Full-width big-number tests.
+  // Expected values verified by Python arbitrary-precision arithmetic.
+  if (test__clzoi2(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          0))
+    return 1;
+  if (test__clzoi2(
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          3))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/cmpoi2_test.c b/compiler-rt/test/builtins/Unit/cmpoi2_test.c
new file mode 100644
index 0000000000000..56682d84f7ba2
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/cmpoi2_test.c
@@ -0,0 +1,93 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_cmpoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI si_int __cmpoi2(oi_int a, oi_int b);
+
+int test__cmpoi2(oi_int a, oi_int b, si_int expected) {
+  si_int x = __cmpoi2(a, b);
+  if (x != expected) {
+    printf("error in __cmpoi2: expected %d, got %d\n", expected, x);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Equal
+  if (test__cmpoi2((oi_int)0, (oi_int)0, 1))
+    return 1;
+  if (test__cmpoi2((oi_int)1, (oi_int)1, 1))
+    return 1;
+  if (test__cmpoi2((oi_int)-1, (oi_int)-1, 1))
+    return 1;
+  // Less than
+  if (test__cmpoi2((oi_int)0, (oi_int)1, 0))
+    return 1;
+  if (test__cmpoi2((oi_int)-1, (oi_int)0, 0))
+    return 1;
+  // Greater than
+  if (test__cmpoi2((oi_int)1, (oi_int)0, 2))
+    return 1;
+  if (test__cmpoi2((oi_int)0, (oi_int)-1, 2))
+    return 1;
+  // Large values: high half > low half
+  if (test__cmpoi2(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                   make_oi(make_ti(0, 0), make_ti(-1, -1)), 2))
+    return 1;
+  // Large equal values
+  {
+    oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+    if (test__cmpoi2(big, big, 1))
+      return 1;
+  }
+  // MAX > 0
+  if (test__cmpoi2(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                   (oi_int)0, 2))
+    return 1;
+  // MIN < 0
+  if (test__cmpoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+                   (oi_int)0, 0))
+    return 1;
+  // MIN < MAX
+  if (test__cmpoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+                   make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                   0))
+    return 1;
+  // Differ only in low half
+  if (test__cmpoi2(make_oi(make_ti(0, 1), make_ti(0, 1)),
+                   make_oi(make_ti(0, 1), make_ti(0, 2)), 0))
+    return 1;
+  if (test__cmpoi2(make_oi(make_ti(0, 1), make_ti(0, 2)),
+                   make_oi(make_ti(0, 1), make_ti(0, 1)), 2))
+    return 1;
+  // Negative values: -1 > -2
+  if (test__cmpoi2((oi_int)-1, (oi_int)-2, 2))
+    return 1;
+  if (test__cmpoi2((oi_int)-2, (oi_int)-1, 0))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated).
+  // A is negative signed, B is positive signed, so A < B.
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__cmpoi2(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          0))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ctzoi2_test.c b/compiler-rt/test/builtins/Unit/ctzoi2_test.c
new file mode 100644
index 0000000000000..4a891e8b9320b
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ctzoi2_test.c
@@ -0,0 +1,83 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ctzoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __ctzoi2(oi_int a);
+
+int test__ctzoi2(oi_int a, int expected) {
+  int x = __ctzoi2(a);
+  if (x != expected) {
+    printf("error in __ctzoi2: expected %d, got %d\n", expected, x);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // 1
+  if (test__ctzoi2((oi_int)1, 0))
+    return 1;
+  // 2
+  if (test__ctzoi2((oi_int)2, 1))
+    return 1;
+  // Bit at position 63
+  if (test__ctzoi2(make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL)),
+                   63))
+    return 1;
+  // Bit at position 64
+  if (test__ctzoi2(make_oi(make_ti(0, 0), make_ti(1, 0)), 64))
+    return 1;
+  // Bit at position 127
+  if (test__ctzoi2(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                   127))
+    return 1;
+  // Bit at position 128
+  if (test__ctzoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 128))
+    return 1;
+  // Bit at position 191
+  if (test__ctzoi2(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+                   191))
+    return 1;
+  // Bit at position 192
+  if (test__ctzoi2(make_oi(make_ti(1, 0), make_ti(0, 0)), 192))
+    return 1;
+  // Bit at position 255 (MSB)
+  if (test__ctzoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+                   255))
+    return 1;
+  // All ones
+  if (test__ctzoi2((oi_int)(ou_int)-1, 0))
+    return 1;
+  // Multiple bits, lowest is position 8
+  if (test__ctzoi2((oi_int)0xFF00, 8))
+    return 1;
+  // Bits in both halves, lowest in low half
+  if (test__ctzoi2(make_oi(make_ti(0, 1), make_ti(0, 0x100)), 8))
+    return 1;
+  // Full-width big-number tests.
+  // Expected values verified by Python arbitrary-precision arithmetic.
+  if (test__ctzoi2(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          16))
+    return 1;
+  if (test__ctzoi2(
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          0))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/divmodoi4_test.c b/compiler-rt/test/builtins/Unit/divmodoi4_test.c
new file mode 100644
index 0000000000000..c9526a33eee30
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/divmodoi4_test.c
@@ -0,0 +1,97 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_divmodoi4
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __divmodoi4(oi_int a, oi_int b, oi_int *rem);
+
+int test__divmodoi4(oi_int a, oi_int b, oi_int expected_q, oi_int expected_r) {
+  oi_int r;
+  oi_int q = __divmodoi4(a, b, &r);
+  if (q != expected_q || r != expected_r) {
+    printf("error in __divmodoi4\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__divmodoi4((oi_int)0, (oi_int)1, (oi_int)0, (oi_int)0))
+    return 1;
+  if (test__divmodoi4((oi_int)10, (oi_int)3, (oi_int)3, (oi_int)1))
+    return 1;
+  if (test__divmodoi4((oi_int)-10, (oi_int)3, (oi_int)-3, (oi_int)-1))
+    return 1;
+  if (test__divmodoi4((oi_int)10, (oi_int)-3, (oi_int)-3, (oi_int)1))
+    return 1;
+  if (test__divmodoi4((oi_int)-10, (oi_int)-3, (oi_int)3, (oi_int)-1))
+    return 1;
+  if (test__divmodoi4((oi_int)100, (oi_int)7, (oi_int)14, (oi_int)2))
+    return 1;
+  // Exact division
+  if (test__divmodoi4((oi_int)42, (oi_int)42, (oi_int)1, (oi_int)0))
+    return 1;
+  // Dividend smaller than divisor
+  if (test__divmodoi4((oi_int)3, (oi_int)10, (oi_int)0, (oi_int)3))
+    return 1;
+  // (1 << 128) / 2
+  if (test__divmodoi4(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)2,
+                      make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                      (oi_int)0))
+    return 1;
+  // (1 << 128) / 3 with remainder
+  if (test__divmodoi4(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3,
+                      make_oi(make_ti(0, 0), make_ti(0x5555555555555555LL,
+                                                     0x5555555555555555ULL)),
+                      (oi_int)1))
+    return 1;
+  // Negative large / positive small
+  if (test__divmodoi4(
+          make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)2,
+          make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0)),
+          (oi_int)0))
+    return 1;
+  // Positive large / negative small
+  if (test__divmodoi4(
+          make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)-2,
+          make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0)),
+          (oi_int)0))
+    return 1;
+  // Large / large (same value)
+  {
+    oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+    if (test__divmodoi4(big, big, (oi_int)1, (oi_int)0))
+      return 1;
+  }
+  // Cross-half boundary value
+  if (test__divmodoi4(make_oi(make_ti(0, 1), make_ti(0, 5)), (oi_int)4,
+                      make_oi(make_ti(0, 0), make_ti(0x4000000000000000LL, 1)),
+                      (oi_int)1))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated).
+  // A(signed) divmod B(signed): quotient is -4; remainder satisfies q*b + r == a.
+  // Expected values verified by Python arbitrary-precision arithmetic.
+  if (test__divmodoi4(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFULL),
+                  make_ti(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFCULL)),
+          make_oi(make_ti(0xEEEF44449999EEEFLL, 0x44449998EEEF4444ULL),
+                  make_ti(0x9999EEEF44449999ULL, 0xEEEF444499954444ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/divoi3_test.c b/compiler-rt/test/builtins/Unit/divoi3_test.c
new file mode 100644
index 0000000000000..00b8b65496eb2
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/divoi3_test.c
@@ -0,0 +1,97 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_divoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __divoi3(oi_int a, oi_int b);
+
+int test__divoi3(oi_int a, oi_int b, oi_int expected) {
+  oi_int x = __divoi3(a, b);
+  if (x != expected) {
+    printf("error in __divoi3\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__divoi3((oi_int)0, (oi_int)1, (oi_int)0))
+    return 1;
+  if (test__divoi3((oi_int)10, (oi_int)3, (oi_int)3))
+    return 1;
+  if (test__divoi3((oi_int)-10, (oi_int)3, (oi_int)-3))
+    return 1;
+  if (test__divoi3((oi_int)10, (oi_int)-3, (oi_int)-3))
+    return 1;
+  if (test__divoi3((oi_int)-10, (oi_int)-3, (oi_int)3))
+    return 1;
+  if (test__divoi3((oi_int)1, (oi_int)1, (oi_int)1))
+    return 1;
+  if (test__divoi3((oi_int)100, (oi_int)10, (oi_int)10))
+    return 1;
+  // Large dividend in high half / small divisor
+  // (1 << 128) / 2 = (1 << 127)
+  if (test__divoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)2,
+                   make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+    return 1;
+  // (1 << 128) / 3
+  if (test__divoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3,
+                   make_oi(make_ti(0, 0), make_ti(0x5555555555555555LL,
+                                                  0x5555555555555555ULL))))
+    return 1;
+  // Negative large dividend
+  // -(1 << 128) / 2 = -(1 << 127)
+  if (test__divoi3(make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)2,
+                   make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0))))
+    return 1;
+  // Large / large (same value)
+  {
+    oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+    if (test__divoi3(big, big, (oi_int)1))
+      return 1;
+  }
+  // Large / large (double)
+  {
+    oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+    oi_int dbl = make_oi(make_ti(0, 0x200), make_ti(0, 0));
+    if (test__divoi3(dbl, big, (oi_int)2))
+      return 1;
+  }
+  // Dividend smaller than divisor
+  if (test__divoi3((oi_int)3, (oi_int)10, (oi_int)0))
+    return 1;
+  // Large negative / large negative
+  {
+    oi_int neg = make_oi(make_ti(-1, -2), make_ti(0, 0));
+    if (test__divoi3(neg, neg, (oi_int)1))
+      return 1;
+  }
+  // Cross-half boundary: value spans both halves
+  if (test__divoi3(make_oi(make_ti(0, 1), make_ti(0, 4)), (oi_int)4,
+                   make_oi(make_ti(0, 0), make_ti(0x4000000000000000LL, 1))))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated).
+  // A(signed) / B(signed) = -4 (truncation toward zero).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__divoi3(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFULL),
+                  make_ti(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFCULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ffsoi2_test.c b/compiler-rt/test/builtins/Unit/ffsoi2_test.c
new file mode 100644
index 0000000000000..30bbfdd3c489e
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ffsoi2_test.c
@@ -0,0 +1,86 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ffsoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __ffsoi2(oi_int a);
+
+int test__ffsoi2(oi_int a, int expected) {
+  int x = __ffsoi2(a);
+  if (x != expected) {
+    printf("error in __ffsoi2: expected %d, got %d\n", expected, x);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Zero
+  if (test__ffsoi2((oi_int)0, 0))
+    return 1;
+  // 1 (bit 0 set)
+  if (test__ffsoi2((oi_int)1, 1))
+    return 1;
+  // 2 (bit 1 set)
+  if (test__ffsoi2((oi_int)2, 2))
+    return 1;
+  // Bit 63 set
+  if (test__ffsoi2(make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL)),
+                   64))
+    return 1;
+  // Bit 64 set
+  if (test__ffsoi2(make_oi(make_ti(0, 0), make_ti(1, 0)), 65))
+    return 1;
+  // Bit 127 set
+  if (test__ffsoi2(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                   128))
+    return 1;
+  // Bit 128 set
+  if (test__ffsoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 129))
+    return 1;
+  // Bit 191 set
+  if (test__ffsoi2(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+                   192))
+    return 1;
+  // Bit 192 set
+  if (test__ffsoi2(make_oi(make_ti(1, 0), make_ti(0, 0)), 193))
+    return 1;
+  // Bit 255 set (MSB)
+  if (test__ffsoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+                   256))
+    return 1;
+  // All ones
+  if (test__ffsoi2((oi_int)(ou_int)-1, 1))
+    return 1;
+  // Multiple bits, lowest is bit 8
+  if (test__ffsoi2((oi_int)0xFF00, 9))
+    return 1;
+  // Bits in both halves, lowest in low half
+  if (test__ffsoi2(make_oi(make_ti(0, 1), make_ti(0, 0x100)), 9))
+    return 1;
+  // Full-width big-number tests.
+  // Expected values verified by Python arbitrary-precision arithmetic.
+  if (test__ffsoi2(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          17))
+    return 1;
+  if (test__ffsoi2(
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          1))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixdfoi_test.c b/compiler-rt/test/builtins/Unit/fixdfoi_test.c
new file mode 100644
index 0000000000000..29d57ee18a690
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixdfoi_test.c
@@ -0,0 +1,93 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixdfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __fixdfoi(double a);
+
+int test__fixdfoi(double a, oi_int expected) {
+  oi_int x = __fixdfoi(a);
+  if (x != expected) {
+    printf("error in __fixdfoi(%f)\n", a);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__fixdfoi(0.0, (oi_int)0))
+    return 1;
+  if (test__fixdfoi(1.0, (oi_int)1))
+    return 1;
+  if (test__fixdfoi(-1.0, (oi_int)-1))
+    return 1;
+  if (test__fixdfoi(42.0, (oi_int)42))
+    return 1;
+  if (test__fixdfoi(-42.0, (oi_int)-42))
+    return 1;
+  if (test__fixdfoi(1e18, (oi_int)1000000000000000000LL))
+    return 1;
+  if (test__fixdfoi(0.5, (oi_int)0))
+    return 1;
+  if (test__fixdfoi(-0.5, (oi_int)0))
+    return 1;
+  if (test__fixdfoi(1.5, (oi_int)1))
+    return 1;
+  if (test__fixdfoi(-1.5, (oi_int)-1))
+    return 1;
+  if (test__fixdfoi(100.0, (oi_int)100))
+    return 1;
+  if (test__fixdfoi(-100.0, (oi_int)-100))
+    return 1;
+  // Rounding toward zero
+  if (test__fixdfoi(0.99, (oi_int)0))
+    return 1;
+  if (test__fixdfoi(1.99, (oi_int)1))
+    return 1;
+  if (test__fixdfoi(-0.99, (oi_int)0))
+    return 1;
+  if (test__fixdfoi(-1.99, (oi_int)-1))
+    return 1;
+  if (test__fixdfoi(2.01, (oi_int)2))
+    return 1;
+  // Double mantissa boundary: 52 bits (53 with implicit 1)
+  // 0x1.FFFFFFFFFFFFFp+62 = max double < 2^63
+  if (test__fixdfoi(0x1.FFFFFFFFFFFFFp+62, (oi_int)0x7FFFFFFFFFFFFC00LL))
+    return 1;
+  if (test__fixdfoi(-0x1.FFFFFFFFFFFFFp+62, -(oi_int)0x7FFFFFFFFFFFFC00LL))
+    return 1;
+  // Exact powers of 2 in the 128+ bit range
+  if (test__fixdfoi(0x1.0p+64, (oi_int)1 << 64))
+    return 1;
+  if (test__fixdfoi(0x1.0p+127, (oi_int)1 << 127))
+    return 1;
+  if (test__fixdfoi(0x1.0p+200, (oi_int)1 << 200))
+    return 1;
+  // Negative large
+  if (test__fixdfoi(-0x1.0p+127, -((oi_int)1 << 127)))
+    return 1;
+  // Values at the double mantissa limit (52-bit precision):
+  // 0x1.FFFFFFFFFFFFFp+126 -- largest double value below 2^127
+  if (test__fixdfoi(0x1.FFFFFFFFFFFFFp+126,
+                    make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFC00LL, 0))))
+    return 1;
+  if (test__fixdfoi(-0x1.FFFFFFFFFFFFFp+126,
+                    make_oi(make_ti(-1, -1), make_ti(0x8000000000000400LL, 0))))
+    return 1;
+  // Specific hex value (from 128-bit reference test)
+  if (test__fixdfoi(0x1.1A3CFE870496Ep+57, (oi_int)0x023479FD0E092DC0LL))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixsfoi_test.c b/compiler-rt/test/builtins/Unit/fixsfoi_test.c
new file mode 100644
index 0000000000000..860fa926b373d
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixsfoi_test.c
@@ -0,0 +1,98 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixsfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __fixsfoi(float a);
+
+int test__fixsfoi(float a, oi_int expected) {
+  oi_int x = __fixsfoi(a);
+  if (x != expected) {
+    printf("error in __fixsfoi(%f)\n", a);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__fixsfoi(0.0f, (oi_int)0))
+    return 1;
+  if (test__fixsfoi(1.0f, (oi_int)1))
+    return 1;
+  if (test__fixsfoi(-1.0f, (oi_int)-1))
+    return 1;
+  if (test__fixsfoi(42.0f, (oi_int)42))
+    return 1;
+  if (test__fixsfoi(-42.0f, (oi_int)-42))
+    return 1;
+  if (test__fixsfoi(0.5f, (oi_int)0))
+    return 1;
+  if (test__fixsfoi(1.5f, (oi_int)1))
+    return 1;
+  if (test__fixsfoi(-0.5f, (oi_int)0))
+    return 1;
+  if (test__fixsfoi(-1.5f, (oi_int)-1))
+    return 1;
+  if (test__fixsfoi(100.0f, (oi_int)100))
+    return 1;
+  if (test__fixsfoi(-100.0f, (oi_int)-100))
+    return 1;
+  if (test__fixsfoi(1e6f, (oi_int)1000000))
+    return 1;
+  // Rounding toward zero for fractional parts
+  if (test__fixsfoi(0.99f, (oi_int)0))
+    return 1;
+  if (test__fixsfoi(1.99f, (oi_int)1))
+    return 1;
+  if (test__fixsfoi(-0.99f, (oi_int)0))
+    return 1;
+  if (test__fixsfoi(-1.99f, (oi_int)-1))
+    return 1;
+  if (test__fixsfoi(2.0f, (oi_int)2))
+    return 1;
+  if (test__fixsfoi(2.01f, (oi_int)2))
+    return 1;
+  if (test__fixsfoi(-2.0f, (oi_int)-2))
+    return 1;
+  // Precision boundary: float has 23 mantissa bits
+  // 0x1.FFFFFEp+62 = max float < 2^63, mantissa fully used
+  if (test__fixsfoi(0x1.FFFFFEp+62F, (oi_int)0x7FFFFF8000000000LL))
+    return 1;
+  if (test__fixsfoi(-0x1.FFFFFEp+62F, -(oi_int)0x7FFFFF8000000000LL))
+    return 1;
+  // Large float that needs >64 bits to represent
+  // 0x1.0p+64 = 2^64 = 18446744073709551616
+  if (test__fixsfoi(0x1.0p+64F, (oi_int)1 << 64))
+    return 1;
+  // 0x1.0p+127 = 2^127
+  if (test__fixsfoi(0x1.0p+127F, (oi_int)1 << 127))
+    return 1;
+  // Largest finite float: 0x1.FFFFFEp+127 = (2^24 - 1) * 2^104
+  // This fits in oi_int (the value occupies exactly 128 bits).
+  if (test__fixsfoi(0x1.FFFFFEp+127F, (oi_int)0xFFFFFF << 104))
+    return 1;
+  // Negative large
+  if (test__fixsfoi(-0x1.0p+127F, -((oi_int)1 << 127)))
+    return 1;
+  // Infinity should saturate to max
+  if (test__fixsfoi(__builtin_inff(), make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1),
+                                              make_ti(-1, -1))))
+    return 1;
+  // Negative infinity should saturate to min
+  if (test__fixsfoi(-__builtin_inff(),
+                    make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixtfoi_test.c b/compiler-rt/test/builtins/Unit/fixtfoi_test.c
new file mode 100644
index 0000000000000..08879a61d19b3
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixtfoi_test.c
@@ -0,0 +1,47 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixtfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+
+COMPILER_RT_ABI oi_int __fixtfoi(long double a);
+
+int test__fixtfoi(long double a, oi_int expected) {
+  oi_int x = __fixtfoi(a);
+  if (x != expected) {
+    printf("error in __fixtfoi\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+  if (test__fixtfoi(0.0L, (oi_int)0))
+    return 1;
+  if (test__fixtfoi(1.0L, (oi_int)1))
+    return 1;
+  if (test__fixtfoi(-1.0L, (oi_int)-1))
+    return 1;
+  if (test__fixtfoi(42.0L, (oi_int)42))
+    return 1;
+  if (test__fixtfoi(-42.0L, (oi_int)-42))
+    return 1;
+  if (test__fixtfoi(0.5L, (oi_int)0))
+    return 1;
+  if (test__fixtfoi(1.5L, (oi_int)1))
+    return 1;
+  if (test__fixtfoi(-0.5L, (oi_int)0))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunsdfoi_test.c b/compiler-rt/test/builtins/Unit/fixunsdfoi_test.c
new file mode 100644
index 0000000000000..01a3a363fc000
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunsdfoi_test.c
@@ -0,0 +1,47 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunsdfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __fixunsdfoi(double a);
+
+int test__fixunsdfoi(double a, ou_int expected) {
+  ou_int x = __fixunsdfoi(a);
+  if (x != expected) {
+    printf("error in __fixunsdfoi(%f)\n", a);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__fixunsdfoi(0.0, (ou_int)0))
+    return 1;
+  if (test__fixunsdfoi(1.0, (ou_int)1))
+    return 1;
+  if (test__fixunsdfoi(42.0, (ou_int)42))
+    return 1;
+  if (test__fixunsdfoi(1e18, (ou_int)1000000000000000000ULL))
+    return 1;
+  if (test__fixunsdfoi(-1.0, (ou_int)0))
+    return 1;
+  if (test__fixunsdfoi(0.5, (ou_int)0))
+    return 1;
+  if (test__fixunsdfoi(1.5, (ou_int)1))
+    return 1;
+  if (test__fixunsdfoi(100.0, (ou_int)100))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunssfoi_test.c b/compiler-rt/test/builtins/Unit/fixunssfoi_test.c
new file mode 100644
index 0000000000000..57cea91d4191c
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunssfoi_test.c
@@ -0,0 +1,47 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunssfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __fixunssfoi(float a);
+
+int test__fixunssfoi(float a, ou_int expected) {
+  ou_int x = __fixunssfoi(a);
+  if (x != expected) {
+    printf("error in __fixunssfoi(%f)\n", a);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__fixunssfoi(0.0f, (ou_int)0))
+    return 1;
+  if (test__fixunssfoi(1.0f, (ou_int)1))
+    return 1;
+  if (test__fixunssfoi(42.0f, (ou_int)42))
+    return 1;
+  if (test__fixunssfoi(-1.0f, (ou_int)0))
+    return 1;
+  if (test__fixunssfoi(0.5f, (ou_int)0))
+    return 1;
+  if (test__fixunssfoi(1.5f, (ou_int)1))
+    return 1;
+  if (test__fixunssfoi(100.0f, (ou_int)100))
+    return 1;
+  if (test__fixunssfoi(1e6f, (ou_int)1000000))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunstfoi_test.c b/compiler-rt/test/builtins/Unit/fixunstfoi_test.c
new file mode 100644
index 0000000000000..b28859d0d4064
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunstfoi_test.c
@@ -0,0 +1,43 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunstfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+
+COMPILER_RT_ABI ou_int __fixunstfoi(long double a);
+
+int test__fixunstfoi(long double a, ou_int expected) {
+  ou_int x = __fixunstfoi(a);
+  if (x != expected) {
+    printf("error in __fixunstfoi\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+  if (test__fixunstfoi(0.0L, (ou_int)0))
+    return 1;
+  if (test__fixunstfoi(1.0L, (ou_int)1))
+    return 1;
+  if (test__fixunstfoi(42.0L, (ou_int)42))
+    return 1;
+  if (test__fixunstfoi(0.5L, (ou_int)0))
+    return 1;
+  if (test__fixunstfoi(1.5L, (ou_int)1))
+    return 1;
+  if (test__fixunstfoi(1000000.0L, (ou_int)1000000))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunsxfoi_test.c b/compiler-rt/test/builtins/Unit/fixunsxfoi_test.c
new file mode 100644
index 0000000000000..c906bca167ced
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunsxfoi_test.c
@@ -0,0 +1,149 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunsxfoi
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to an unsigned 256-bit integer, rounding toward zero.
+//          Negative values all become zero.
+
+// Assumption: long double is an intel 80 bit floating point type padded with 6
+// bytes
+//             ou_int is a 256 bit integral type
+//             value in long double is representable in ou_int or is negative
+//                 (no range checking performed)
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI ou_int __fixunsxfoi(long double a);
+
+int test__fixunsxfoi(long double a, ou_int expected) {
+  ou_int x = __fixunsxfoi(a);
+  if (x != expected) {
+    printf("error in __fixunsxfoi(%LA)\n", a);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(ou_int) == 2 * sizeof(tu_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+  if (test__fixunsxfoi(0.0, 0))
+    return 1;
+
+  if (test__fixunsxfoi(0.5, 0))
+    return 1;
+  if (test__fixunsxfoi(0.99, 0))
+    return 1;
+  if (test__fixunsxfoi(1.0, 1))
+    return 1;
+  if (test__fixunsxfoi(1.5, 1))
+    return 1;
+  if (test__fixunsxfoi(1.99, 1))
+    return 1;
+  if (test__fixunsxfoi(2.0, 2))
+    return 1;
+  if (test__fixunsxfoi(2.01, 2))
+    return 1;
+  if (test__fixunsxfoi(-0.5, 0))
+    return 1;
+  if (test__fixunsxfoi(-0.99, 0))
+    return 1;
+  if (test__fixunsxfoi(-1.0, 0))
+    return 1;
+  if (test__fixunsxfoi(-1.5, 0))
+    return 1;
+  if (test__fixunsxfoi(-1.99, 0))
+    return 1;
+  if (test__fixunsxfoi(-2.0, 0))
+    return 1;
+  if (test__fixunsxfoi(-2.01, 0))
+    return 1;
+
+  // Float precision boundary tests
+  if (test__fixunsxfoi(0x1.FFFFFEp+62, 0x7FFFFF8000000000LL))
+    return 1;
+  if (test__fixunsxfoi(0x1.FFFFFCp+62, 0x7FFFFF0000000000LL))
+    return 1;
+
+  if (test__fixunsxfoi(-0x1.FFFFFEp+62, 0))
+    return 1;
+  if (test__fixunsxfoi(-0x1.FFFFFCp+62, 0))
+    return 1;
+
+  // Double precision boundary tests
+  if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFp+62, 0x7FFFFFFFFFFFFC00LL))
+    return 1;
+  if (test__fixunsxfoi(0x1.FFFFFFFFFFFFEp+62, 0x7FFFFFFFFFFFF800LL))
+    return 1;
+
+  if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFFp+62, 0))
+    return 1;
+  if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFEp+62, 0))
+    return 1;
+
+  // Long double (80-bit) full precision tests near 64-bit boundary
+  if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFFFEp+63L, 0xFFFFFFFFFFFFFFFFLL))
+    return 1;
+  if (test__fixunsxfoi(0x1.0000000000000002p+63L, 0x8000000000000001LL))
+    return 1;
+  if (test__fixunsxfoi(0x1.0000000000000000p+63L, 0x8000000000000000LL))
+    return 1;
+  if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFFFCp+62L, 0x7FFFFFFFFFFFFFFFLL))
+    return 1;
+  if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFFF8p+62L, 0x7FFFFFFFFFFFFFFELL))
+    return 1;
+
+  if (test__fixunsxfoi(-0x1.0000000000000000p+63L, 0))
+    return 1;
+  if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFFFFCp+62L, 0))
+    return 1;
+  if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFFFF8p+62L, 0))
+    return 1;
+
+  // Tests at 128-bit boundary
+  if (test__fixunsxfoi(
+          0x1.FFFFFFFFFFFFFFFEp+127L,
+          make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFLL, 0))))
+    return 1;
+  if (test__fixunsxfoi(
+          0x1.0000000000000002p+127L,
+          make_oi(make_ti(0, 0), make_ti(0x8000000000000001LL, 0))))
+    return 1;
+  if (test__fixunsxfoi(
+          0x1.0000000000000000p+127L,
+          make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+    return 1;
+  if (test__fixunsxfoi(
+          0x1.FFFFFFFFFFFFFFFCp+126L,
+          make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFFFFLL, 0))))
+    return 1;
+
+  // Tests beyond 128-bit boundary
+  // 2^200
+  if (test__fixunsxfoi(0x1.0p+200L, (ou_int)1 << 200))
+    return 1;
+
+  // Near 256-bit boundary
+  if (test__fixunsxfoi(
+          0x1.FFFFFFFFFFFFFFFEp+255L,
+          make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0x0000000000000000LL),
+                  make_ti(0, 0))))
+    return 1;
+
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixxfoi_test.c b/compiler-rt/test/builtins/Unit/fixxfoi_test.c
new file mode 100644
index 0000000000000..78c59ffe3243b
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixxfoi_test.c
@@ -0,0 +1,144 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixxfoi
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to a signed 256-bit integer, rounding toward zero.
+
+// Assumption: long double is an intel 80 bit floating point type padded with 6
+// bytes
+//             oi_int is a 256 bit integral type
+//             value in long double is representable in oi_int (no range
+//             checking performed)
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI oi_int __fixxfoi(long double a);
+
+int test__fixxfoi(long double a, oi_int expected) {
+  oi_int x = __fixxfoi(a);
+  if (x != expected) {
+    printf("error in __fixxfoi(%LA)\n", a);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+  if (test__fixxfoi(0.0, 0))
+    return 1;
+
+  if (test__fixxfoi(0.5, 0))
+    return 1;
+  if (test__fixxfoi(0.99, 0))
+    return 1;
+  if (test__fixxfoi(1.0, 1))
+    return 1;
+  if (test__fixxfoi(1.5, 1))
+    return 1;
+  if (test__fixxfoi(1.99, 1))
+    return 1;
+  if (test__fixxfoi(2.0, 2))
+    return 1;
+  if (test__fixxfoi(2.01, 2))
+    return 1;
+  if (test__fixxfoi(-0.5, 0))
+    return 1;
+  if (test__fixxfoi(-0.99, 0))
+    return 1;
+  if (test__fixxfoi(-1.0, -1))
+    return 1;
+  if (test__fixxfoi(-1.5, -1))
+    return 1;
+  if (test__fixxfoi(-1.99, -1))
+    return 1;
+  if (test__fixxfoi(-2.0, -2))
+    return 1;
+  if (test__fixxfoi(-2.01, -2))
+    return 1;
+
+  // Float precision boundary tests (from 128-bit reference)
+  if (test__fixxfoi(0x1.FFFFFEp+62, 0x7FFFFF8000000000LL))
+    return 1;
+  if (test__fixxfoi(0x1.FFFFFCp+62, 0x7FFFFF0000000000LL))
+    return 1;
+
+  if (test__fixxfoi(-0x1.FFFFFEp+62, -(oi_int)0x7FFFFF8000000000LL))
+    return 1;
+  if (test__fixxfoi(-0x1.FFFFFCp+62, -(oi_int)0x7FFFFF0000000000LL))
+    return 1;
+
+  // Double precision boundary tests
+  if (test__fixxfoi(0x1.FFFFFFFFFFFFFp+62, 0x7FFFFFFFFFFFFC00LL))
+    return 1;
+  if (test__fixxfoi(0x1.FFFFFFFFFFFFEp+62, 0x7FFFFFFFFFFFF800LL))
+    return 1;
+
+  if (test__fixxfoi(-0x1.FFFFFFFFFFFFFp+62, -(oi_int)0x7FFFFFFFFFFFFC00LL))
+    return 1;
+  if (test__fixxfoi(-0x1.FFFFFFFFFFFFEp+62, -(oi_int)0x7FFFFFFFFFFFF800LL))
+    return 1;
+
+  // Long double (80-bit) full precision tests
+  if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFCp+62L, 0x7FFFFFFFFFFFFFFFLL))
+    return 1;
+  if (test__fixxfoi(0x1.FFFFFFFFFFFFFFF8p+62L, 0x7FFFFFFFFFFFFFFELL))
+    return 1;
+
+  if (test__fixxfoi(-0x1.0000000000000000p+63L, -(oi_int)0x8000000000000000LL))
+    return 1;
+  if (test__fixxfoi(-0x1.FFFFFFFFFFFFFFFCp+62L, -(oi_int)0x7FFFFFFFFFFFFFFFLL))
+    return 1;
+  if (test__fixxfoi(-0x1.FFFFFFFFFFFFFFF8p+62L, -(oi_int)0x7FFFFFFFFFFFFFFELL))
+    return 1;
+
+  // Tests at 128-bit boundary (same as ti tests, but still fits in oi)
+  if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFEp+126L,
+                    make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFFFFLL,
+                                                   0x8000000000000000LL))))
+    return 1;
+  if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFCp+126L,
+                    make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFFFFLL, 0))))
+    return 1;
+
+  if (test__fixxfoi(-0x1.0000000000000000p+127L,
+                    -make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+    return 1;
+
+  // Tests beyond 128-bit boundary: values needing >128 bits
+  // 2^200
+  if (test__fixxfoi(0x1.0p+200L, (oi_int)1 << 200))
+    return 1;
+  if (test__fixxfoi(-0x1.0p+200L, -((oi_int)1 << 200)))
+    return 1;
+
+  // Value near 256-bit boundary
+  // 0x1.FFFFFFFFFFFFFFFEp+254L is the largest xf value that fits in oi_int
+  if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFEp+254L,
+                    make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, 0x8000000000000000LL),
+                            make_ti(0, 0))))
+    return 1;
+  if (test__fixxfoi(
+          -0x1.FFFFFFFFFFFFFFFEp+254L,
+          -make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, 0x8000000000000000LL),
+                   make_ti(0, 0))))
+    return 1;
+
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoidf_test.c b/compiler-rt/test/builtins/Unit/floatoidf_test.c
new file mode 100644
index 0000000000000..7b147e5ce69a3
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoidf_test.c
@@ -0,0 +1,89 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoidf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI double __floatoidf(oi_int a);
+
+int test__floatoidf(oi_int a, double expected) {
+  double x = __floatoidf(a);
+  if (x != expected) {
+    printf("error in __floatoidf: got %f, expected %f\n", x, expected);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__floatoidf((oi_int)0, 0.0))
+    return 1;
+  if (test__floatoidf((oi_int)1, 1.0))
+    return 1;
+  if (test__floatoidf((oi_int)-1, -1.0))
+    return 1;
+  if (test__floatoidf((oi_int)42, 42.0))
+    return 1;
+  if (test__floatoidf((oi_int)-42, -42.0))
+    return 1;
+  if (test__floatoidf((oi_int)1000000, 1e6))
+    return 1;
+  if (test__floatoidf((oi_int)-1000000, -1e6))
+    return 1;
+  if (test__floatoidf((oi_int)100, 100.0))
+    return 1;
+  if (test__floatoidf((oi_int)20, 20.0))
+    return 1;
+  if (test__floatoidf((oi_int)-20, -20.0))
+    return 1;
+  // Double mantissa boundary: 52 bits (53 with implicit 1)
+  // 2^53 = 9007199254740992, exactly representable
+  if (test__floatoidf((oi_int)9007199254740992LL, 9007199254740992.0))
+    return 1;
+  // 2^53 + 1: NOT exactly representable, rounds to 2^53
+  if (test__floatoidf((oi_int)9007199254740993LL, 9007199254740992.0))
+    return 1;
+  // 2^53 + 2: exactly representable
+  if (test__floatoidf((oi_int)9007199254740994LL, 9007199254740994.0))
+    return 1;
+  // Specific values from 128-bit reference tests
+  if (test__floatoidf((oi_int)0x7FFFFF8000000000LL, 0x1.FFFFFEp+62))
+    return 1;
+  if (test__floatoidf((oi_int)0x7FFFFFFFFFFFF800LL, 0x1.FFFFFFFFFFFFEp+62))
+    return 1;
+  // Large values spanning >64 bits
+  if (test__floatoidf((oi_int)1 << 64, 0x1.0p+64))
+    return 1;
+  if (test__floatoidf((oi_int)1 << 127, 0x1.0p+127))
+    return 1;
+  if (test__floatoidf(-((oi_int)1 << 127), -0x1.0p+127))
+    return 1;
+  // Very large value: 2^200
+  if (test__floatoidf((oi_int)1 << 200, 0x1.0p+200))
+    return 1;
+  // Values with high-half mantissa bits:
+  // make_oi(make_ti(0x7FFFFF8000000000, 0), make_ti(0, 0))
+  // = 0x7FFFFF8000000000 << 192, leading 1 at bit 254
+  if (test__floatoidf(make_oi(make_ti(0x7FFFFF8000000000LL, 0), make_ti(0, 0)),
+                      0x1.FFFFFEp+254))
+    return 1;
+  // Negative large
+  if (test__floatoidf(make_oi(make_ti(0x8000008000000000LL, 0), make_ti(0, 0)),
+                      -0x1.FFFFFEp+254))
+    return 1;
+  // Specific hex value (adapted from 128-bit reference)
+  if (test__floatoidf((oi_int)0x023479FD0E092DC0LL, 0x1.1A3CFE870496Ep+57))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoisf_test.c b/compiler-rt/test/builtins/Unit/floatoisf_test.c
new file mode 100644
index 0000000000000..ea33058344892
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoisf_test.c
@@ -0,0 +1,77 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoisf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI float __floatoisf(oi_int a);
+
+int test__floatoisf(oi_int a, float expected) {
+  float x = __floatoisf(a);
+  if (x != expected) {
+    printf("error in __floatoisf: got %f, expected %f\n", x, expected);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__floatoisf((oi_int)0, 0.0f))
+    return 1;
+  if (test__floatoisf((oi_int)1, 1.0f))
+    return 1;
+  if (test__floatoisf((oi_int)-1, -1.0f))
+    return 1;
+  if (test__floatoisf((oi_int)42, 42.0f))
+    return 1;
+  if (test__floatoisf((oi_int)-42, -42.0f))
+    return 1;
+  if (test__floatoisf((oi_int)100, 100.0f))
+    return 1;
+  if (test__floatoisf((oi_int)-100, -100.0f))
+    return 1;
+  if (test__floatoisf((oi_int)1000000, 1e6f))
+    return 1;
+  if (test__floatoisf((oi_int)-1000000, -1e6f))
+    return 1;
+  if (test__floatoisf((oi_int)20, 20.0f))
+    return 1;
+  if (test__floatoisf((oi_int)-20, -20.0f))
+    return 1;
+  // Precision boundary: float has 23 mantissa bits (24 with implicit 1)
+  // 2^24 = 16777216, exactly representable
+  if (test__floatoisf((oi_int)16777216, 16777216.0f))
+    return 1;
+  // 2^24 + 1 = 16777217: NOT exactly representable in float,
+  // rounds to 16777216.0f
+  if (test__floatoisf((oi_int)16777217, 16777216.0f))
+    return 1;
+  // 2^24 + 2 = 16777218: exactly representable (even, rounds-to-even)
+  if (test__floatoisf((oi_int)16777218, 16777218.0f))
+    return 1;
+  // Values at the mantissa boundary:
+  // 0x7FFFFF8000000000 = mantissa all-ones shifted to bit 62
+  if (test__floatoisf((oi_int)0x7FFFFF8000000000LL, 0x1.FFFFFEp+62F))
+    return 1;
+  // Large 256-bit value: 2^127
+  if (test__floatoisf((oi_int)1 << 127, 0x1.0p+127F))
+    return 1;
+  // Large negative
+  if (test__floatoisf(-((oi_int)1 << 127), -0x1.0p+127F))
+    return 1;
+  // Value > 128 bits: 2^200 exceeds float range, should return +inf
+  if (test__floatoisf((oi_int)1 << 200, __builtin_inff()))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoitf_test.c b/compiler-rt/test/builtins/Unit/floatoitf_test.c
new file mode 100644
index 0000000000000..db95716158f02
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoitf_test.c
@@ -0,0 +1,45 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoitf
+// REQUIRES: int256
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+
+COMPILER_RT_ABI fp_t __floatoitf(oi_int a);
+
+int test__floatoitf(oi_int a, fp_t expected) {
+  fp_t x = __floatoitf(a);
+  if (x != expected) {
+    printf("error in __floatoitf\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+  if (test__floatoitf((oi_int)0, TF_C(0.0)))
+    return 1;
+  if (test__floatoitf((oi_int)1, TF_C(1.0)))
+    return 1;
+  if (test__floatoitf((oi_int)-1, TF_C(-1.0)))
+    return 1;
+  if (test__floatoitf((oi_int)42, TF_C(42.0)))
+    return 1;
+  if (test__floatoitf((oi_int)-42, TF_C(-42.0)))
+    return 1;
+  if (test__floatoitf((oi_int)1000000, TF_C(1e6)))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoixf_test.c b/compiler-rt/test/builtins/Unit/floatoixf_test.c
new file mode 100644
index 0000000000000..f6dde67a9f9fe
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoixf_test.c
@@ -0,0 +1,114 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoixf
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <float.h>
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to a long double, rounding toward even.
+
+// Assumption: long double is an IEEE 80-bit floating point type padded to 128
+// bits
+//             oi_int is a 256 bit integral type
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI long double __floatoixf(oi_int a);
+
+int test__floatoixf(oi_int a, long double expected) {
+  long double x = __floatoixf(a);
+  if (x != expected) {
+    printf("error in __floatoixf = %LA, expected %LA\n", x, expected);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+  if (test__floatoixf(0, 0.0))
+    return 1;
+
+  if (test__floatoixf(1, 1.0))
+    return 1;
+  if (test__floatoixf(2, 2.0))
+    return 1;
+  if (test__floatoixf(20, 20.0))
+    return 1;
+  if (test__floatoixf(-1, -1.0))
+    return 1;
+  if (test__floatoixf(-2, -2.0))
+    return 1;
+  if (test__floatoixf(-20, -20.0))
+    return 1;
+
+  // Precision boundary tests (from 128-bit reference)
+  if (test__floatoixf(0x7FFFFF8000000000LL, 0x1.FFFFFEp+62))
+    return 1;
+  if (test__floatoixf(0x7FFFFFFFFFFFF800LL, 0x1.FFFFFFFFFFFFEp+62))
+    return 1;
+  if (test__floatoixf(0x7FFFFF0000000000LL, 0x1.FFFFFCp+62))
+    return 1;
+  if (test__floatoixf(0x7FFFFFFFFFFFF000LL, 0x1.FFFFFFFFFFFFCp+62))
+    return 1;
+
+  // Full long double precision (64-bit mantissa)
+  if (test__floatoixf(0x7FFFFFFFFFFFFFFFLL, 0xF.FFFFFFFFFFFFFFEp+59L))
+    return 1;
+  if (test__floatoixf(0x023479FD0E092DC0LL, 0x1.1A3CFE870496Ep+57))
+    return 1;
+  if (test__floatoixf(0x023479FD0E092DA1LL, 0x1.1A3CFE870496D08p+57L))
+    return 1;
+
+  // Values spanning >64 bits (128-bit range, in oi_int)
+  if (test__floatoixf(make_oi(make_ti(0, 0), make_ti(0x023479FD0E092DC0LL, 0)),
+                      0x1.1A3CFE870496Ep+121L))
+    return 1;
+
+  // Negative values
+  if (test__floatoixf(make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, -1),
+                              make_ti(0x8000008000000000LL, 0)),
+                      -0x1.FFFFFEp+126))
+    return 1;
+  if (test__floatoixf(make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, -1),
+                              make_ti(0x8000000000000000LL, 0)),
+                      -0x1.000000p+127))
+    return 1;
+  if (test__floatoixf(make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, -1),
+                              make_ti(0x8000000000000001LL, 0)),
+                      -0x1.FFFFFFFFFFFFFFFCp+126L))
+    return 1;
+
+  // Values beyond 128-bit range: high half set
+  if (test__floatoixf(make_oi(make_ti(0, 1), make_ti(0, 0)), 0x1.0p+128L))
+    return 1;
+  // 2^200
+  if (test__floatoixf((oi_int)1 << 200, 0x1.0p+200L))
+    return 1;
+
+  // Large 256-bit value near max
+  if (test__floatoixf(make_oi(make_ti(0x023479FD0E092DC0LL, 0), make_ti(0, 0)),
+                      0x1.1A3CFE870496Ep+249L))
+    return 1;
+
+  // Max unsigned 64-bit in lower half
+  if (test__floatoixf(make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFLL)),
+                      0x1.FFFFFFFFFFFFFFFEp+63L))
+    return 1;
+
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoidf_test.c b/compiler-rt/test/builtins/Unit/floatunoidf_test.c
new file mode 100644
index 0000000000000..b822b611bcccc
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoidf_test.c
@@ -0,0 +1,43 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoidf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI double __floatunoidf(ou_int a);
+
+int test__floatunoidf(ou_int a, double expected) {
+  double x = __floatunoidf(a);
+  if (x != expected) {
+    printf("error in __floatunoidf: got %f, expected %f\n", x, expected);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__floatunoidf((ou_int)0, 0.0))
+    return 1;
+  if (test__floatunoidf((ou_int)1, 1.0))
+    return 1;
+  if (test__floatunoidf((ou_int)42, 42.0))
+    return 1;
+  if (test__floatunoidf((ou_int)1000000, 1e6))
+    return 1;
+  if (test__floatunoidf((ou_int)1000000000000000000ULL, 1e18))
+    return 1;
+  if (test__floatunoidf((ou_int)100, 100.0))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoisf_test.c b/compiler-rt/test/builtins/Unit/floatunoisf_test.c
new file mode 100644
index 0000000000000..6be53202d0e26
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoisf_test.c
@@ -0,0 +1,41 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoisf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI float __floatunoisf(ou_int a);
+
+int test__floatunoisf(ou_int a, float expected) {
+  float x = __floatunoisf(a);
+  if (x != expected) {
+    printf("error in __floatunoisf: got %f, expected %f\n", x, expected);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__floatunoisf((ou_int)0, 0.0f))
+    return 1;
+  if (test__floatunoisf((ou_int)1, 1.0f))
+    return 1;
+  if (test__floatunoisf((ou_int)42, 42.0f))
+    return 1;
+  if (test__floatunoisf((ou_int)100, 100.0f))
+    return 1;
+  if (test__floatunoisf((ou_int)1000000, 1e6f))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoitf_test.c b/compiler-rt/test/builtins/Unit/floatunoitf_test.c
new file mode 100644
index 0000000000000..0407235b2edcb
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoitf_test.c
@@ -0,0 +1,43 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoitf
+// REQUIRES: int256
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+
+COMPILER_RT_ABI fp_t __floatunoitf(ou_int a);
+
+int test__floatunoitf(ou_int a, fp_t expected) {
+  fp_t x = __floatunoitf(a);
+  if (x != expected) {
+    printf("error in __floatunoitf\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+  if (test__floatunoitf((ou_int)0, TF_C(0.0)))
+    return 1;
+  if (test__floatunoitf((ou_int)1, TF_C(1.0)))
+    return 1;
+  if (test__floatunoitf((ou_int)42, TF_C(42.0)))
+    return 1;
+  if (test__floatunoitf((ou_int)1000000, TF_C(1e6)))
+    return 1;
+  if (test__floatunoitf((ou_int)1000000000000000000ULL, TF_C(1e18)))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoixf_test.c b/compiler-rt/test/builtins/Unit/floatunoixf_test.c
new file mode 100644
index 0000000000000..402330b3525bf
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoixf_test.c
@@ -0,0 +1,123 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoixf
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <float.h>
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to a long double, rounding toward even.
+
+// Assumption: long double is an IEEE 80-bit floating point type padded to 128
+// bits
+//             ou_int is a 256 bit integral type
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI long double __floatunoixf(ou_int a);
+
+int test__floatunoixf(ou_int a, long double expected) {
+  long double x = __floatunoixf(a);
+  if (x != expected) {
+    printf("error in __floatunoixf = %LA, expected %LA\n", x, expected);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(ou_int) == 2 * sizeof(tu_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+  if (test__floatunoixf(0, 0.0))
+    return 1;
+
+  if (test__floatunoixf(1, 1.0))
+    return 1;
+  if (test__floatunoixf(2, 2.0))
+    return 1;
+  if (test__floatunoixf(20, 20.0))
+    return 1;
+
+  // Precision boundary tests
+  if (test__floatunoixf(0x7FFFFF8000000000ULL, 0x1.FFFFFEp+62))
+    return 1;
+  if (test__floatunoixf(0x7FFFFFFFFFFFF800ULL, 0x1.FFFFFFFFFFFFEp+62))
+    return 1;
+  if (test__floatunoixf(0x7FFFFF0000000000ULL, 0x1.FFFFFCp+62))
+    return 1;
+  if (test__floatunoixf(0x7FFFFFFFFFFFF000ULL, 0x1.FFFFFFFFFFFFCp+62))
+    return 1;
+  if (test__floatunoixf(0x7FFFFFFFFFFFFFFFULL, 0xF.FFFFFFFFFFFFFFEp+59L))
+    return 1;
+  if (test__floatunoixf(0xFFFFFFFFFFFFFFFEULL, 0xF.FFFFFFFFFFFFFFEp+60L))
+    return 1;
+  if (test__floatunoixf(0xFFFFFFFFFFFFFFFFULL, 0xF.FFFFFFFFFFFFFFFp+60L))
+    return 1;
+
+  // Specific hex value tests
+  if (test__floatunoixf(0x8000008000000000ULL, 0x8.000008p+60))
+    return 1;
+  if (test__floatunoixf(0x8000000000000800ULL, 0x8.0000000000008p+60))
+    return 1;
+  if (test__floatunoixf(0x8000000000000000ULL, 0x8p+60))
+    return 1;
+  if (test__floatunoixf(0x8000000000000001ULL, 0x8.000000000000001p+60L))
+    return 1;
+
+  if (test__floatunoixf(0x0007FB72E8000000LL, 0x1.FEDCBAp+50))
+    return 1;
+  if (test__floatunoixf(0x023479FD0E092DC0LL, 0x1.1A3CFE870496Ep+57))
+    return 1;
+  if (test__floatunoixf(0x023479FD0E092DA1LL, 0x1.1A3CFE870496D08p+57L))
+    return 1;
+
+  // Values spanning >64 bits (128-bit range, in ou_int)
+  if (test__floatunoixf(
+          make_oi(make_ti(0, 0), make_ti(0x023479FD0E092DC0LL, 0)),
+          0x1.1A3CFE870496Ep+121L))
+    return 1;
+
+  // Max unsigned 128-bit value in lower half
+  if (test__floatunoixf(make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFLL,
+                                                       0xFFFFFFFFFFFFFFFFLL)),
+                        0x1.0000000000000000p+128L))
+    return 1;
+  if (test__floatunoixf(
+          make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFLL, 0)),
+          0x1.FFFFFFFFFFFFFFFEp+127L))
+    return 1;
+
+  // Values beyond 128-bit range: high half set
+  if (test__floatunoixf(make_oi(make_ti(0, 1), make_ti(0, 0)), 0x1.0p+128L))
+    return 1;
+  // 2^200
+  if (test__floatunoixf((ou_int)1 << 200, 0x1.0p+200L))
+    return 1;
+
+  // Large 256-bit value near max
+  if (test__floatunoixf(
+          make_oi(make_ti(0x023479FD0E092DC0LL, 0), make_ti(0, 0)),
+          0x1.1A3CFE870496Ep+249L))
+    return 1;
+
+  // Max 256-bit unsigned value
+  if (test__floatunoixf(
+          make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL),
+                  make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL)),
+          0x1.0000000000000000p+256L))
+    return 1;
+
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/lit.cfg.py b/compiler-rt/test/builtins/Unit/lit.cfg.py
index 59da054848f3c..48e07ec43e8ff 100644
--- a/compiler-rt/test/builtins/Unit/lit.cfg.py
+++ b/compiler-rt/test/builtins/Unit/lit.cfg.py
@@ -165,6 +165,17 @@ def get_libgcc_file_name():
 if not builtins_is_msvc:
     config.available_features.add("int128")
 
+# Check if __int256 is supported by the target compiler
+import subprocess
+
+int256_check = subprocess.run(
+    [config.clang.strip(), "-x", "c", "-c", "-o", "/dev/null", "-"],
+    input=b"__int256_t x;",
+    capture_output=True,
+)
+if int256_check.returncode == 0:
+    config.available_features.add("int256")
+
 clang_wrapper = ""
 
 
diff --git a/compiler-rt/test/builtins/Unit/lshroi3_test.c b/compiler-rt/test/builtins/Unit/lshroi3_test.c
new file mode 100644
index 0000000000000..fde52dda538ad
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/lshroi3_test.c
@@ -0,0 +1,101 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_lshroi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __lshroi3(oi_int a, int b);
+
+int test__lshroi3(oi_int a, int b, oi_int expected) {
+  oi_int x = __lshroi3(a, b);
+  if (x != expected) {
+    printf("error in __lshroi3: shift by %d\n", b);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Shift by 0
+  if (test__lshroi3((oi_int)1, 0, (oi_int)1))
+    return 1;
+  // Shift by 1
+  if (test__lshroi3((oi_int)2, 1, (oi_int)1))
+    return 1;
+  // Logical shift negative by 1 (no sign extension)
+  if (test__lshroi3(
+          (oi_int)-1, 1,
+          make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1))))
+    return 1;
+  // Shift by 63
+  if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL)),
+                    63, (oi_int)1))
+    return 1;
+  // Shift by 64
+  if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(1, 0)), 64, (oi_int)1))
+    return 1;
+  // Shift by 127
+  if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                    127, (oi_int)1))
+    return 1;
+  // Shift by 128
+  if (test__lshroi3(make_oi(make_ti(0, 1), make_ti(0, 0)), 128, (oi_int)1))
+    return 1;
+  // Shift by 129
+  if (test__lshroi3(make_oi(make_ti(0, 2), make_ti(0, 0)), 129, (oi_int)1))
+    return 1;
+  // Shift by 191
+  if (test__lshroi3(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+                    191, (oi_int)1))
+    return 1;
+  // Shift by 192
+  if (test__lshroi3(make_oi(make_ti(0xABCDLL, 0), make_ti(0, 0)), 192,
+                    (oi_int)0xABCDLL))
+    return 1;
+  // Shift all-ones by 255
+  if (test__lshroi3((oi_int)(ou_int)-1, 255, (oi_int)1))
+    return 1;
+  // Multi-bit value shift by 64
+  if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFULL, 0)),
+                    64,
+                    make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFULL))))
+    return 1;
+  // Multi-bit value shift by 128
+  if (test__lshroi3(make_oi(make_ti(0, 0xFFFFFFFFFFFFFFFFULL), make_ti(0, 0)),
+                    128,
+                    make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFULL))))
+    return 1;
+  // Shift that spans both halves
+  if (test__lshroi3(make_oi(make_ti(0, 1), make_ti(0, 0)), 1,
+                    make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+    return 1;
+  // Full value shift by 0 (identity)
+  if (test__lshroi3(
+          make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+                  make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL)),
+          0,
+          make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+                  make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL))))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated, shift crosses 64-bit boundary).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__lshroi3(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          73,
+          make_oi(make_ti(0x0000000000000000LL, 0x0055555DDDE6666EULL),
+                  make_ti(0xEEF7777FFF888891ULL, 0x111999A2222AAAB3ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/modoi3_test.c b/compiler-rt/test/builtins/Unit/modoi3_test.c
new file mode 100644
index 0000000000000..f7969cbafa407
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/modoi3_test.c
@@ -0,0 +1,82 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_modoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __modoi3(oi_int a, oi_int b);
+
+int test__modoi3(oi_int a, oi_int b, oi_int expected) {
+  oi_int x = __modoi3(a, b);
+  if (x != expected) {
+    printf("error in __modoi3\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__modoi3((oi_int)0, (oi_int)1, (oi_int)0))
+    return 1;
+  if (test__modoi3((oi_int)10, (oi_int)3, (oi_int)1))
+    return 1;
+  if (test__modoi3((oi_int)-10, (oi_int)3, (oi_int)-1))
+    return 1;
+  if (test__modoi3((oi_int)10, (oi_int)-3, (oi_int)1))
+    return 1;
+  if (test__modoi3((oi_int)-10, (oi_int)-3, (oi_int)-1))
+    return 1;
+  if (test__modoi3((oi_int)100, (oi_int)7, (oi_int)2))
+    return 1;
+  // Exact division has zero remainder
+  if (test__modoi3((oi_int)42, (oi_int)42, (oi_int)0))
+    return 1;
+  // Dividend smaller than divisor
+  if (test__modoi3((oi_int)3, (oi_int)10, (oi_int)3))
+    return 1;
+  if (test__modoi3((oi_int)-3, (oi_int)10, (oi_int)-3))
+    return 1;
+  // Large value in high half mod small
+  // (1 << 128) % 3 = 1 (since 2^128 mod 3 = 1)
+  if (test__modoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3, (oi_int)1))
+    return 1;
+  // (1 << 128) % 2 = 0
+  if (test__modoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)2, (oi_int)0))
+    return 1;
+  // Negative large value mod
+  if (test__modoi3(make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)3,
+                   (oi_int)-1))
+    return 1;
+  // Cross-half boundary value mod small
+  if (test__modoi3(make_oi(make_ti(0, 1), make_ti(0, 5)), (oi_int)4, (oi_int)1))
+    return 1;
+  // Large mod large (same value)
+  {
+    oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+    if (test__modoi3(big, big, (oi_int)0))
+      return 1;
+  }
+  // Full-width big-number test (all 4 limbs populated).
+  // A(signed) % B(signed), verified by Python: q*b + r == a.
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__modoi3(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          make_oi(make_ti(0xEEEF44449999EEEFLL, 0x44449998EEEF4444ULL),
+                  make_ti(0x9999EEEF44449999ULL, 0xEEEF444499954444ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/muloi5_test.c b/compiler-rt/test/builtins/Unit/muloi5_test.c
new file mode 100644
index 0000000000000..9f42881b43d28
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/muloi5_test.c
@@ -0,0 +1,164 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_muloi5
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __muloi5(oi_int a, oi_int b, int *overflow);
+
+int test__muloi5(oi_int a, oi_int b, oi_int expected, int expected_overflow) {
+  int overflow;
+  oi_int x = __muloi5(a, b, &overflow);
+  if (overflow != expected_overflow || (!expected_overflow && x != expected)) {
+    printf("error in __muloi5: overflow=%d (expected %d)\n", overflow,
+           expected_overflow);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__muloi5((oi_int)0, (oi_int)0, (oi_int)0, 0))
+    return 1;
+  if (test__muloi5((oi_int)1, (oi_int)1, (oi_int)1, 0))
+    return 1;
+  if (test__muloi5((oi_int)2, (oi_int)3, (oi_int)6, 0))
+    return 1;
+  if (test__muloi5((oi_int)-1, (oi_int)1, (oi_int)-1, 0))
+    return 1;
+  if (test__muloi5((oi_int)-1, (oi_int)-1, (oi_int)1, 0))
+    return 1;
+  // Large * 0 = 0, no overflow
+  if (test__muloi5(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                   (oi_int)0, (oi_int)0, 0))
+    return 1;
+  // 0 * large = 0, no overflow
+  if (test__muloi5((oi_int)0,
+                   make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                   (oi_int)0, 0))
+    return 1;
+  // Cross-half multiplication without overflow
+  // (1 << 64) * (1 << 64) = (1 << 128)
+  if (test__muloi5(make_oi(make_ti(0, 0), make_ti(1, 0)),
+                   make_oi(make_ti(0, 0), make_ti(1, 0)),
+                   make_oi(make_ti(0, 1), make_ti(0, 0)), 0))
+    return 1;
+  // (1 << 127) * 2 = (1 << 128), no overflow
+  if (test__muloi5(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                   (oi_int)2, make_oi(make_ti(0, 1), make_ti(0, 0)), 0))
+    return 1;
+  // MAX * 1 = MAX, no overflow
+  {
+    oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+    if (test__muloi5(MAX, (oi_int)1, MAX, 0))
+      return 1;
+  }
+  // MAX * 2 overflows
+  {
+    oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+    if (test__muloi5(MAX, (oi_int)2, (oi_int)0, 1))
+      return 1;
+  }
+  // MIN * -1 overflows
+  {
+    oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+    if (test__muloi5(MIN, (oi_int)-1, (oi_int)0, 1))
+      return 1;
+  }
+  // MIN * 1 = MIN, no overflow
+  {
+    oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+    if (test__muloi5(MIN, (oi_int)1, MIN, 0))
+      return 1;
+  }
+  // (1 << 128) * (1 << 128) overflows (result would be 1 << 256)
+  if (test__muloi5(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                   make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)0, 1))
+    return 1;
+  // Negative * negative = positive, no overflow
+  if (test__muloi5((oi_int)-100, (oi_int)-200, (oi_int)20000, 0))
+    return 1;
+  // === Near-overflow boundary tests ===
+  {
+    oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+    oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+    // MAX / 2 * 2 = MAX - 1 (since MAX is odd), no overflow
+    oi_int half_max = MAX >> 1; // = (MAX-1)/2
+    if (test__muloi5(
+            half_max, (oi_int)2,
+            make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -2)), 0))
+      return 1;
+    // (MAX/2 + 1) * 2 = MAX + 1, overflows
+    if (test__muloi5(half_max + 1, (oi_int)2, (oi_int)0, 1))
+      return 1;
+    // MIN / 2 * 2 = MIN, no overflow
+    oi_int half_min = MIN >> 1; // = MIN/2
+    if (test__muloi5(half_min, (oi_int)2, MIN, 0))
+      return 1;
+    // (MIN/2 - 1) * 2 = MIN - 2, overflows
+    if (test__muloi5(half_min - 1, (oi_int)2, (oi_int)0, 1))
+      return 1;
+    // MAX * -1 = -MAX (= MIN + 1), no overflow
+    if (test__muloi5(MAX, (oi_int)-1,
+                     make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1)),
+                     0))
+      return 1;
+    // MIN * 2 overflows
+    if (test__muloi5(MIN, (oi_int)2, (oi_int)0, 1))
+      return 1;
+    // MAX * -2 overflows
+    if (test__muloi5(MAX, (oi_int)-2, (oi_int)0, 1))
+      return 1;
+    // (1 << 127) * (1 << 127) = (1 << 254), no overflow
+    if (test__muloi5(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                     make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                     make_oi(make_ti(0x4000000000000000LL, 0), make_ti(0, 0)),
+                     0))
+      return 1;
+    // (1 << 128) * (1 << 126) = (1 << 254), no overflow
+    if (test__muloi5(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                     make_oi(make_ti(0, 0), make_ti(0x4000000000000000LL, 0)),
+                     make_oi(make_ti(0x4000000000000000LL, 0), make_ti(0, 0)),
+                     0))
+      return 1;
+    // (1 << 128) * (1 << 127) = (1 << 255) overflows (== MIN as unsigned,
+    // but as signed this is negative and the operands are both positive)
+    if (test__muloi5(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                     make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                     (oi_int)0, 1))
+      return 1;
+  }
+  // === Commutativity check ===
+  {
+    int ov1, ov2;
+    oi_int a = make_oi(make_ti(0x12345678LL, 0), make_ti(0, 0xABCDEF01ULL));
+    oi_int b = make_oi(make_ti(0, 0), make_ti(0, 0xFEDCBA98ULL));
+    oi_int r1 = __muloi5(a, b, &ov1);
+    oi_int r2 = __muloi5(b, a, &ov2);
+    if (r1 != r2 || ov1 != ov2)
+      return 1;
+  }
+  // Full-width big-number multiplication (fits in 255 bits, no overflow).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__muloi5(
+          make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+                  make_ti(0x7766554433221100ULL, 0xFFEEDDCCBBAA9988ULL)),
+          make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+                  make_ti(0x0000000000000002ULL, 0x1111111111111111ULL)),
+          make_oi(make_ti(0x0000000000000000LL, 0xF6C26BF3589BBCBDULL),
+                  make_ti(0xC4B3A291806F5E4CULL, 0x3334579D048E3A08ULL)),
+          0))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/multi5_test.c b/compiler-rt/test/builtins/Unit/multi5_test.c
new file mode 100644
index 0000000000000..f0c594c18d203
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/multi5_test.c
@@ -0,0 +1,174 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_multi5
+// REQUIRES: int256
+//
+// Tests for 256-bit multiplication (__multi5). The 128-bit equivalent
+// (multi3_test.c) has ~125 lines of hand-picked cases; this test matches that
+// approach and adds cases specifically targeting 256-bit partial product carry
+// propagation (4 x 128-bit partial products), commutativity, and squaring
+// identities.
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __multi5(oi_int a, oi_int b);
+
+int test__multi5(oi_int a, oi_int b, oi_int expected) {
+  oi_int x = __multi5(a, b);
+  if (x != expected) {
+    printf("error in __multi5\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // 0 * 0
+  if (test__multi5((oi_int)0, (oi_int)0, (oi_int)0))
+    return 1;
+  // 1 * 1
+  if (test__multi5((oi_int)1, (oi_int)1, (oi_int)1))
+    return 1;
+  // 2 * 3
+  if (test__multi5((oi_int)2, (oi_int)3, (oi_int)6))
+    return 1;
+  // -1 * 1
+  if (test__multi5((oi_int)-1, (oi_int)1, (oi_int)-1))
+    return 1;
+  // -1 * -1
+  if (test__multi5((oi_int)-1, (oi_int)-1, (oi_int)1))
+    return 1;
+  // Large * 0
+  if (test__multi5(make_oi(make_ti(0xFFFF, 0xFFFF), make_ti(0xFFFF, 0xFFFF)),
+                   (oi_int)0, (oi_int)0))
+    return 1;
+  // 0 * large
+  if (test__multi5((oi_int)0,
+                   make_oi(make_ti(0xFFFF, 0xFFFF), make_ti(0xFFFF, 0xFFFF)),
+                   (oi_int)0))
+    return 1;
+  // 0x10000 * 0x10000 = 0x100000000
+  if (test__multi5((oi_int)0x10000, (oi_int)0x10000, (oi_int)0x100000000LL))
+    return 1;
+  // Large value multiplication within low half
+  if (test__multi5(make_oi(make_ti(0, 0), make_ti(0, 0x100000000LL)),
+                   make_oi(make_ti(0, 0), make_ti(0, 0x100000000LL)),
+                   make_oi(make_ti(0, 0), make_ti(1, 0))))
+    return 1;
+  // Cross-half multiplication: two low-half operands carry into the high half
+  // (1 << 64) * (1 << 64) = (1 << 128)
+  if (test__multi5(make_oi(make_ti(0, 0), make_ti(1, 0)),
+                   make_oi(make_ti(0, 0), make_ti(1, 0)),
+                   make_oi(make_ti(0, 1), make_ti(0, 0))))
+    return 1;
+  // (1 << 127) * 2 = (1 << 128)
+  if (test__multi5(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                   (oi_int)2, make_oi(make_ti(0, 1), make_ti(0, 0))))
+    return 1;
+  // Negative * positive with cross-half result
+  // -(1 << 64) * (1 << 64) = -(1 << 128)
+  if (test__multi5(make_oi(make_ti(-1, -1), make_ti(-1, 0)),
+                   make_oi(make_ti(0, 0), make_ti(1, 0)),
+                   make_oi(make_ti(-1, -1), make_ti(0, 0))))
+    return 1;
+  // Large * 1 = identity
+  {
+    oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+    if (test__multi5(big, (oi_int)1, big))
+      return 1;
+  }
+  // Large * -1 = negation
+  if (test__multi5(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)-1,
+                   make_oi(make_ti(-1, -1), make_ti(0, 0))))
+    return 1;
+  // High half * small
+  if (test__multi5(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3,
+                   make_oi(make_ti(0, 3), make_ti(0, 0))))
+    return 1;
+  // Commutativity
+  if (test__multi5((oi_int)3, make_oi(make_ti(0, 1), make_ti(0, 0)),
+                   make_oi(make_ti(0, 3), make_ti(0, 0))))
+    return 1;
+  // (2^64 - 1) * (2^64 - 1) = 2^128 - 2^65 + 1
+  // Exercises partial product carry propagation across 64-bit boundaries.
+  if (test__multi5(make_oi(make_ti(0, 0), make_ti(0, -1)),
+                   make_oi(make_ti(0, 0), make_ti(0, -1)),
+                   make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFELL,
+                                                  0x0000000000000001LL))))
+    return 1;
+  // (2^128 - 1) * 3 = 3 * 2^128 - 3
+  // Cross-half multiplication with borrow from low half.
+  if (test__multi5(make_oi(make_ti(0, 0), make_ti(-1, -1)), (oi_int)3,
+                   make_oi(make_ti(0, 2), make_ti(-1, -3))))
+    return 1;
+  // Power-of-2 multiplication: (1 << 200) * (1 << 40) = (1 << 240)
+  if (test__multi5(make_oi(make_ti(0x100, 0), make_ti(0, 0)),
+                   make_oi(make_ti(0, 0), make_ti(0, 1LL << 40)),
+                   make_oi(make_ti(0x1000000000000LL, 0), make_ti(0, 0))))
+    return 1;
+  // (2^64 + 1) * 3 = 3 * 2^64 + 3 -- small cross-word carry
+  if (test__multi5(make_oi(make_ti(0, 0), make_ti(1, 1)), (oi_int)3,
+                   make_oi(make_ti(0, 0), make_ti(3, 3))))
+    return 1;
+  // (2^128 + 1) * (2^128 - 1) = 2^256 - 1 (wraps to -1 in signed)
+  if (test__multi5(make_oi(make_ti(0, 1), make_ti(0, 1)),
+                   make_oi(make_ti(0, 0), make_ti(-1, -1)), (oi_int)-1))
+    return 1;
+  // All-ones * all-ones = 1 (in modular arithmetic, (-1)*(-1) = 1)
+  if (test__multi5((oi_int)-1, (oi_int)-1, (oi_int)1))
+    return 1;
+  // === Large * large where all 4 partial products contribute ===
+  // a = (2^192 + 2^64 + 1), b = (2^192 + 2^64 + 1)
+  // a^2 = 2^384 + 2*2^256 + 2*2^192 + 2^128 + 2*2^64 + 1
+  // Mod 2^256: 2^193 + 2^128 + 2^65 + 1 (2^384 and 2*2^256 overflow away)
+  {
+    oi_int a = make_oi(make_ti(1, 0), make_ti(1, 1));
+    oi_int expected = make_oi(make_ti(2, 1), make_ti(2, 1));
+    if (test__multi5(a, a, expected))
+      return 1;
+  }
+  // Verify a * b == b * a for all partial product combinations
+  // a has bits set in all 4 64-bit words, b likewise
+  {
+    oi_int a = make_oi(make_ti(0xAAAAAAAA, 0xBBBBBBBB),
+                       make_ti(0xCCCCCCCC, 0xDDDDDDDD));
+    oi_int b = make_oi(make_ti(0x11111111, 0x22222222),
+                       make_ti(0x33333333, 0x44444444));
+    oi_int r1 = __multi5(a, b);
+    oi_int r2 = __multi5(b, a);
+    if (r1 != r2)
+      return 1;
+    // The round-trip identity (a * b) / b == a is covered by division tests.
+  }
+  // Squaring: (2^128 - 1)^2 = 2^256 - 2^129 + 1
+  // Mod 2^256: -2^129 + 1 = -(2^129) + 1
+  {
+    oi_int a = make_oi(make_ti(0, 0), make_ti(-1, -1)); // 2^128 - 1
+    // Expected: 2^256 - 2^129 + 1 mod 2^256, i.e. high half = 2^128 - 2
+    // (0xFFFF...FFFE) and low half = 1.
+    oi_int expected = make_oi(make_ti(-1, -2), make_ti(0, 1));
+    if (test__multi5(a, a, expected))
+      return 1;
+  }
+  // Full-width big-number test (all 4 limbs populated).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__multi5(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          make_oi(make_ti(0x0B609752EEEECDEFLL, 0xF01311110ECA71C7ULL),
+                  make_ti(0x06D389ABB60B47ADULL, 0xFA4F89AC5C290000ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/mulvoi3_test.c b/compiler-rt/test/builtins/Unit/mulvoi3_test.c
new file mode 100644
index 0000000000000..c9771c88f24d6
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/mulvoi3_test.c
@@ -0,0 +1,119 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_mulvoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __mulvoi3(oi_int a, oi_int b);
+
+int test__mulvoi3(oi_int a, oi_int b, oi_int expected) {
+  oi_int x = __mulvoi3(a, b);
+  if (x != expected) {
+    printf("error in __mulvoi3\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__mulvoi3((oi_int)0, (oi_int)0, (oi_int)0))
+    return 1;
+  if (test__mulvoi3((oi_int)1, (oi_int)1, (oi_int)1))
+    return 1;
+  if (test__mulvoi3((oi_int)2, (oi_int)3, (oi_int)6))
+    return 1;
+  if (test__mulvoi3((oi_int)-1, (oi_int)1, (oi_int)-1))
+    return 1;
+  if (test__mulvoi3((oi_int)-1, (oi_int)-1, (oi_int)1))
+    return 1;
+  if (test__mulvoi3((oi_int)0x10000, (oi_int)0x10000, (oi_int)0x100000000LL))
+    return 1;
+  // Large * 0 = 0
+  if (test__mulvoi3(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                    (oi_int)0, (oi_int)0))
+    return 1;
+  // Cross-half multiplication: (1 << 64) * (1 << 64) = (1 << 128)
+  if (test__mulvoi3(make_oi(make_ti(0, 0), make_ti(1, 0)),
+                    make_oi(make_ti(0, 0), make_ti(1, 0)),
+                    make_oi(make_ti(0, 1), make_ti(0, 0))))
+    return 1;
+  // Negative * positive
+  if (test__mulvoi3((oi_int)-100, (oi_int)200, (oi_int)-20000))
+    return 1;
+  // Negative * negative
+  if (test__mulvoi3((oi_int)-100, (oi_int)-200, (oi_int)20000))
+    return 1;
+  // Large * 1 = identity
+  {
+    oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+    if (test__mulvoi3(big, (oi_int)1, big))
+      return 1;
+  }
+  // Note: overflow cases would abort, so we don't test them.
+  // Instead, we test the maximum non-overflowing products.
+
+  // MAX * 1 = MAX
+  {
+    oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+    if (test__mulvoi3(MAX, (oi_int)1, MAX))
+      return 1;
+  }
+  // MAX * -1 = -MAX (= MIN + 1)
+  {
+    oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+    oi_int NEG_MAX = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1));
+    if (test__mulvoi3(MAX, (oi_int)-1, NEG_MAX))
+      return 1;
+  }
+  // MIN * 1 = MIN
+  {
+    oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+    if (test__mulvoi3(MIN, (oi_int)1, MIN))
+      return 1;
+  }
+  // (MAX/2) * 2 = MAX - 1 (MAX is odd)
+  {
+    oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+    oi_int half = MAX >> 1;
+    oi_int expected =
+        make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -2));
+    if (test__mulvoi3(half, (oi_int)2, expected))
+      return 1;
+  }
+  // (1 << 127) * (1 << 127) = (1 << 254), near MAX but not overflow
+  if (test__mulvoi3(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                    make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+                    make_oi(make_ti(0x4000000000000000LL, 0), make_ti(0, 0))))
+    return 1;
+  // Commutativity
+  if (test__mulvoi3((oi_int)17, (oi_int)19, (oi_int)323))
+    return 1;
+  if (test__mulvoi3((oi_int)19, (oi_int)17, (oi_int)323))
+    return 1;
+  // Large negative * negative = positive
+  if (test__mulvoi3(make_oi(make_ti(-1, -1), make_ti(-1, -100)), (oi_int)-1,
+                    make_oi(make_ti(0, 0), make_ti(0, 100))))
+    return 1;
+  // Full-width big-number multiplication (fits in 255 bits, no overflow).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__mulvoi3(
+          make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+                  make_ti(0x7766554433221100ULL, 0xFFEEDDCCBBAA9988ULL)),
+          make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+                  make_ti(0x0000000000000002ULL, 0x1111111111111111ULL)),
+          make_oi(make_ti(0x0000000000000000LL, 0xF6C26BF3589BBCBDULL),
+                  make_ti(0xC4B3A291806F5E4CULL, 0x3334579D048E3A08ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/negoi2_test.c b/compiler-rt/test/builtins/Unit/negoi2_test.c
new file mode 100644
index 0000000000000..c51d7db210d9c
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/negoi2_test.c
@@ -0,0 +1,69 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_negoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __negoi2(oi_int a);
+
+int test__negoi2(oi_int a, oi_int expected) {
+  oi_int x = __negoi2(a);
+  if (x != expected) {
+    printf("error in __negoi2\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__negoi2((oi_int)0, (oi_int)0))
+    return 1;
+  if (test__negoi2((oi_int)1, (oi_int)-1))
+    return 1;
+  if (test__negoi2((oi_int)-1, (oi_int)1))
+    return 1;
+  if (test__negoi2((oi_int)42, (oi_int)-42))
+    return 1;
+  if (test__negoi2((oi_int)-42, (oi_int)42))
+    return 1;
+  // Large value in high half
+  if (test__negoi2(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                   make_oi(make_ti(-1, -1), make_ti(0, 0))))
+    return 1;
+  // Negate back
+  if (test__negoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)),
+                   make_oi(make_ti(0, 1), make_ti(0, 0))))
+    return 1;
+  // MAX
+  if (test__negoi2(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                   make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1))))
+    return 1;
+  // Value with bits in low half only
+  if (test__negoi2(make_oi(make_ti(0, 0), make_ti(0, 1)),
+                   make_oi(make_ti(-1, -1), make_ti(-1, -1))))
+    return 1;
+  // Value spanning both halves
+  if (test__negoi2(make_oi(make_ti(0, 1), make_ti(0, 1)),
+                   make_oi(make_ti(-1, -2), make_ti(-1, -1))))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__negoi2(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x5555444433332222LL, 0x11110000EEEEDDDDULL),
+                  make_ti(0xCCCCBBBBAAAA9999ULL, 0x8888777766670000ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/negvoi2_test.c b/compiler-rt/test/builtins/Unit/negvoi2_test.c
new file mode 100644
index 0000000000000..6e11f6a855284
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/negvoi2_test.c
@@ -0,0 +1,59 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_negvoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __negvoi2(oi_int a);
+
+int test__negvoi2(oi_int a, oi_int expected) {
+  oi_int x = __negvoi2(a);
+  if (x != expected) {
+    printf("error in __negvoi2\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__negvoi2((oi_int)0, (oi_int)0))
+    return 1;
+  if (test__negvoi2((oi_int)1, (oi_int)-1))
+    return 1;
+  if (test__negvoi2((oi_int)-1, (oi_int)1))
+    return 1;
+  if (test__negvoi2((oi_int)42, (oi_int)-42))
+    return 1;
+  if (test__negvoi2((oi_int)-42, (oi_int)42))
+    return 1;
+  // Large value in high half
+  if (test__negvoi2(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                    make_oi(make_ti(-1, -1), make_ti(0, 0))))
+    return 1;
+  // MAX
+  if (test__negvoi2(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+                    make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1))))
+    return 1;
+  // Note: MIN would abort, so we don't test it.
+  // Full-width big-number test (all 4 limbs populated).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  // The operand is negative as a signed value, so its negation equals |C|.
+  if (test__negvoi2(
+          make_oi(make_ti(0xDDDDEEEEFFFF0000LL, 0x1111222233334444ULL),
+                  make_ti(0x5555666677778888ULL, 0x9999AAAABBBBCCCCULL)),
+          make_oi(make_ti(0x222211110000FFFFLL, 0xEEEEDDDDCCCCBBBBULL),
+                  make_ti(0xAAAA999988887777ULL, 0x6666555544443334ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/parityoi2_test.c b/compiler-rt/test/builtins/Unit/parityoi2_test.c
new file mode 100644
index 0000000000000..fd8094091240d
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/parityoi2_test.c
@@ -0,0 +1,83 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_parityoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __parityoi2(oi_int a);
+
+int test__parityoi2(oi_int a, int expected) {
+  int x = __parityoi2(a);
+  if (x != expected) {
+    printf("error in __parityoi2: expected %d, got %d\n", expected, x);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Zero (even parity)
+  if (test__parityoi2((oi_int)0, 0))
+    return 1;
+  // One (odd parity)
+  if (test__parityoi2((oi_int)1, 1))
+    return 1;
+  // Two bits set (even parity)
+  if (test__parityoi2((oi_int)3, 0))
+    return 1;
+  // Three bits set (odd parity)
+  if (test__parityoi2((oi_int)7, 1))
+    return 1;
+  // All ones = 256 bits set (even parity)
+  if (test__parityoi2((oi_int)(ou_int)-1, 0))
+    return 1;
+  // One bit in high half (odd parity)
+  if (test__parityoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 1))
+    return 1;
+  // One bit in each half (even parity)
+  if (test__parityoi2(make_oi(make_ti(0, 1), make_ti(0, 1)), 0))
+    return 1;
+  // High half all ones (128 bits = even), low half zero
+  if (test__parityoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)), 0))
+    return 1;
+  // 0xFF (8 bits = even parity)
+  if (test__parityoi2((oi_int)0xFF, 0))
+    return 1;
+  // 0x7F (7 bits = odd parity)
+  if (test__parityoi2((oi_int)0x7F, 1))
+    return 1;
+  // MSB only (odd parity)
+  if (test__parityoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+                      1))
+    return 1;
+  // One bit in each 64-bit word (4 bits = even parity)
+  if (test__parityoi2(make_oi(make_ti(1, 1), make_ti(1, 1)), 0))
+    return 1;
+  // Three bits across multiple words (odd parity)
+  if (test__parityoi2(make_oi(make_ti(1, 1), make_ti(1, 0)), 1))
+    return 1;
+  // Full-width big-number tests.
+  // Expected values verified by Python arbitrary-precision arithmetic.
+  if (test__parityoi2(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          0))
+    return 1;
+  if (test__parityoi2(
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          0))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/popcountoi2_test.c b/compiler-rt/test/builtins/Unit/popcountoi2_test.c
new file mode 100644
index 0000000000000..87b4237a0821d
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/popcountoi2_test.c
@@ -0,0 +1,86 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_popcountoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __popcountoi2(oi_int a);
+
+int test__popcountoi2(oi_int a, int expected) {
+  int x = __popcountoi2(a);
+  if (x != expected) {
+    printf("error in __popcountoi2: expected %d, got %d\n", expected, x);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Zero
+  if (test__popcountoi2((oi_int)0, 0))
+    return 1;
+  // One
+  if (test__popcountoi2((oi_int)1, 1))
+    return 1;
+  // All ones (256 bits)
+  if (test__popcountoi2((oi_int)(ou_int)-1, 256))
+    return 1;
+  // 0xFF (8 bits set)
+  if (test__popcountoi2((oi_int)0xFF, 8))
+    return 1;
+  // One bit in each 128-bit half
+  if (test__popcountoi2(make_oi(make_ti(0, 1), make_ti(0, 1)), 2))
+    return 1;
+  // 0xFF in high half only
+  if (test__popcountoi2(make_oi(make_ti(0, 0xFF), make_ti(0, 0)), 8))
+    return 1;
+  // Alternating bits (0xAA...AA) = 128 bits set
+  if (test__popcountoi2(
+          make_oi(make_ti(0xAAAAAAAAAAAAAAAALL, 0xAAAAAAAAAAAAAAAALL),
+                  make_ti(0xAAAAAAAAAAAAAAAALL, 0xAAAAAAAAAAAAAAAALL)),
+          128))
+    return 1;
+  // Alternating bits (0x55...55) = 128 bits set
+  if (test__popcountoi2(
+          make_oi(make_ti(0x5555555555555555LL, 0x5555555555555555LL),
+                  make_ti(0x5555555555555555LL, 0x5555555555555555LL)),
+          128))
+    return 1;
+  // One bit in each 64-bit word (4 bits total)
+  if (test__popcountoi2(make_oi(make_ti(1, 1), make_ti(1, 1)), 4))
+    return 1;
+  // High half all ones, low half zero = 128
+  if (test__popcountoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)), 128))
+    return 1;
+  // Low half all ones, high half zero = 128
+  if (test__popcountoi2(make_oi(make_ti(0, 0), make_ti(-1, -1)), 128))
+    return 1;
+  // Single high bit = 1
+  if (test__popcountoi2(
+          make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)), 1))
+    return 1;
+  // Full-width big-number tests.
+  // Expected values verified by Python arbitrary-precision arithmetic.
+  if (test__popcountoi2(
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          128))
+    return 1;
+  if (test__popcountoi2(
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          132))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/subvoi3_test.c b/compiler-rt/test/builtins/Unit/subvoi3_test.c
new file mode 100644
index 0000000000000..65969571454a3
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/subvoi3_test.c
@@ -0,0 +1,81 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_subvoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __subvoi3(oi_int a, oi_int b);
+
+int test__subvoi3(oi_int a, oi_int b, oi_int expected) {
+  oi_int x = __subvoi3(a, b);
+  if (x != expected) {
+    printf("error in __subvoi3\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__subvoi3((oi_int)0, (oi_int)0, (oi_int)0))
+    return 1;
+  if (test__subvoi3((oi_int)2, (oi_int)1, (oi_int)1))
+    return 1;
+  if (test__subvoi3((oi_int)0, (oi_int)1, (oi_int)-1))
+    return 1;
+  if (test__subvoi3((oi_int)300, (oi_int)200, (oi_int)100))
+    return 1;
+  // Negative result
+  if (test__subvoi3((oi_int)100, (oi_int)200, (oi_int)-100))
+    return 1;
+  // Negative - positive
+  if (test__subvoi3((oi_int)-100, (oi_int)200, (oi_int)-300))
+    return 1;
+  // Negative - negative
+  if (test__subvoi3((oi_int)-100, (oi_int)-200, (oi_int)100))
+    return 1;
+  // Borrow across 128-bit boundary (high half to low half)
+  if (test__subvoi3(make_oi(make_ti(0, 1), make_ti(0, 0)),
+                    make_oi(make_ti(0, 0), make_ti(0, 1)),
+                    make_oi(make_ti(0, 0), make_ti(-1, -1))))
+    return 1;
+  // Large values
+  if (test__subvoi3(make_oi(make_ti(0, 3), make_ti(0, 0)),
+                    make_oi(make_ti(0, 1), make_ti(0, 0)),
+                    make_oi(make_ti(0, 2), make_ti(0, 0))))
+    return 1;
+  // x - x = 0
+  {
+    oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+    if (test__subvoi3(big, big, (oi_int)0))
+      return 1;
+  }
+  // x - 0 = x
+  {
+    oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+    if (test__subvoi3(big, (oi_int)0, big))
+      return 1;
+  }
+  // Full-width big-number test (all 4 limbs populated).
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  // B(signed) - A(signed) = 0x66666666...66661111
+  if (test__subvoi3(
+          make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_oi(make_ti(0x6666666666666666LL, 0x6666666766666666ULL),
+                  make_ti(0x6666666666666666ULL, 0x6666666666661111ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ucmpoi2_test.c b/compiler-rt/test/builtins/Unit/ucmpoi2_test.c
new file mode 100644
index 0000000000000..5881f3b8c01e9
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ucmpoi2_test.c
@@ -0,0 +1,89 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ucmpoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI si_int __ucmpoi2(ou_int a, ou_int b);
+
+int test__ucmpoi2(ou_int a, ou_int b, si_int expected) {
+  si_int x = __ucmpoi2(a, b);
+  if (x != expected) {
+    printf("error in __ucmpoi2: expected %d, got %d\n", expected, x);
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // Equal
+  if (test__ucmpoi2((ou_int)0, (ou_int)0, 1))
+    return 1;
+  if (test__ucmpoi2((ou_int)1, (ou_int)1, 1))
+    return 1;
+  // Less than
+  if (test__ucmpoi2((ou_int)0, (ou_int)1, 0))
+    return 1;
+  // Greater than
+  if (test__ucmpoi2((ou_int)1, (ou_int)0, 2))
+    return 1;
+  // All-ones is maximum unsigned
+  if (test__ucmpoi2((ou_int)-1, (ou_int)0, 2))
+    return 1;
+  if (test__ucmpoi2((ou_int)0, (ou_int)-1, 0))
+    return 1;
+  // High half comparison
+  if (test__ucmpoi2(make_ou(make_tu(0, 1), make_tu(0, 0)),
+                    make_ou(make_tu(0, 0), make_tu(-1, -1)), 2))
+    return 1;
+  // Large equal values
+  {
+    ou_int big = make_ou(make_tu(0x1234, 0x5678), make_tu(0x9ABC, 0xDEF0));
+    if (test__ucmpoi2(big, big, 1))
+      return 1;
+  }
+  // MAX > 0
+  if (test__ucmpoi2((ou_int)-1, (ou_int)0, 2))
+    return 1;
+  // Differ only in low half
+  if (test__ucmpoi2(make_ou(make_tu(0, 1), make_tu(0, 1)),
+                    make_ou(make_tu(0, 1), make_tu(0, 2)), 0))
+    return 1;
+  if (test__ucmpoi2(make_ou(make_tu(0, 1), make_tu(0, 2)),
+                    make_ou(make_tu(0, 1), make_tu(0, 1)), 2))
+    return 1;
+  // Differ only in highest 64-bit word
+  if (test__ucmpoi2(make_ou(make_tu(1, 0), make_tu(0, 0)),
+                    make_ou(make_tu(2, 0), make_tu(0, 0)), 0))
+    return 1;
+  if (test__ucmpoi2(make_ou(make_tu(2, 0), make_tu(0, 0)),
+                    make_ou(make_tu(1, 0), make_tu(0, 0)), 2))
+    return 1;
+  // Adjacent values
+  if (test__ucmpoi2((ou_int)100, (ou_int)101, 0))
+    return 1;
+  if (test__ucmpoi2((ou_int)101, (ou_int)100, 2))
+    return 1;
+  // Full-width big-number test (all 4 limbs populated).
+  // A > B unsigned.
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__ucmpoi2(
+          make_ou(make_tu(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+                  make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_ou(make_tu(0x1111222233334444LL, 0x5555666677778888ULL),
+                  make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          2))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/udivmodoi4_test.c b/compiler-rt/test/builtins/Unit/udivmodoi4_test.c
new file mode 100644
index 0000000000000..f7482dae78fd9
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/udivmodoi4_test.c
@@ -0,0 +1,272 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_udivmodoi4
+// REQUIRES: int256
+//
+// Testing strategy: The 128-bit equivalent (udivmodti4_test.c) uses a 65K-line
+// auto-generated exhaustive test vector file from the initial compiler-rt
+// import (no generator script available). Instead of replicating that approach
+// for 256-bit, this test uses:
+//   1. Hand-picked edge cases covering both code paths in the Knuth algorithm
+//      (Path 1: divisor fits in 128 bits, Path 2: divisor spans both halves)
+//   2. A 100-iteration pseudo-random invariant checker that verifies
+//      q * b + r == a and r < b for diverse LCG-generated inputs
+//   3. A divisor size sweep from 1-bit to 255-bit divisors
+// This catches the same class of bugs as exhaustive enumeration while being
+// maintainable and readable.
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __udivmodoi4(ou_int a, ou_int b, ou_int *rem);
+
+int test__udivmodoi4(ou_int a, ou_int b, ou_int expected_q, ou_int expected_r) {
+  ou_int r;
+  ou_int q = __udivmodoi4(a, b, &r);
+  if (q != expected_q || r != expected_r) {
+    printf("error in __udivmodoi4\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  // 0 / 1
+  if (test__udivmodoi4((ou_int)0, (ou_int)1, (ou_int)0, (ou_int)0))
+    return 1;
+  // 1 / 1
+  if (test__udivmodoi4((ou_int)1, (ou_int)1, (ou_int)1, (ou_int)0))
+    return 1;
+  // 10 / 3
+  if (test__udivmodoi4((ou_int)10, (ou_int)3, (ou_int)3, (ou_int)1))
+    return 1;
+  // 100 / 7
+  if (test__udivmodoi4((ou_int)100, (ou_int)7, (ou_int)14, (ou_int)2))
+    return 1;
+  // Large value / small
+  if (test__udivmodoi4(
+          make_ou(make_tu(0, 0), make_tu(1, 0)), (ou_int)2,
+          make_ou(make_tu(0, 0), make_tu(0, 0x8000000000000000ULL)), (ou_int)0))
+    return 1;
+  // Dividend < divisor
+  if (test__udivmodoi4((ou_int)3, (ou_int)10, (ou_int)0, (ou_int)3))
+    return 1;
+  // Equal
+  if (test__udivmodoi4((ou_int)42, (ou_int)42, (ou_int)1, (ou_int)0))
+    return 1;
+  // Large divisor (both halves)
+  {
+    ou_int big = make_ou(make_tu(0, 1), make_tu(0, 0));
+    if (test__udivmodoi4(big, big, (ou_int)1, (ou_int)0))
+      return 1;
+  }
+  // (1 << 128) / 3 = quotient with remainder 1
+  if (test__udivmodoi4(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)3,
+                       make_ou(make_tu(0, 0), make_tu(0x5555555555555555ULL,
+                                                      0x5555555555555555ULL)),
+                       (ou_int)1))
+    return 1;
+  // All-ones / 2 = 0x7FFF...FFFF remainder 1
+  if (test__udivmodoi4(
+          (ou_int)-1, (ou_int)2,
+          make_ou(make_tu(0x7FFFFFFFFFFFFFFFULL, -1), make_tu(-1, -1)),
+          (ou_int)1))
+    return 1;
+  // Cross-half boundary: value spans both halves
+  if (test__udivmodoi4(
+          make_ou(make_tu(0, 1), make_tu(0, 5)), (ou_int)4,
+          make_ou(make_tu(0, 0), make_tu(0x4000000000000000ULL, 1)), (ou_int)1))
+    return 1;
+  // Large / large (double)
+  {
+    ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+    ou_int dbl = make_ou(make_tu(0, 0x200), make_tu(0, 0));
+    if (test__udivmodoi4(dbl, big, (ou_int)2, (ou_int)0))
+      return 1;
+  }
+  // Very large divisor in high half
+  {
+    ou_int big = make_ou(make_tu(1, 0), make_tu(0, 0));
+    if (test__udivmodoi4(big, big, (ou_int)1, (ou_int)0))
+      return 1;
+  }
+  // Large value with remainder
+  {
+    ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 7));
+    ou_int div = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+    if (test__udivmodoi4(big, div, (ou_int)1, (ou_int)7))
+      return 1;
+  }
+  // Division by power of 2 vs equivalent shift: (1 << 192) / (1 << 64)
+  // = (1 << 128). Path 1: divisor.s.high == 0.
+  if (test__udivmodoi4(make_ou(make_tu(1, 0), make_tu(0, 0)),
+                       make_ou(make_tu(0, 0), make_tu(1, 0)),
+                       make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)0))
+    return 1;
+  // Path 1: Large dividend / medium 128-bit divisor.
+  // (2^192 + 2^64) / (2^64) = 2^128 + 1, remainder 0.
+  if (test__udivmodoi4(make_ou(make_tu(1, 0), make_tu(1, 0)),
+                       make_ou(make_tu(0, 0), make_tu(1, 0)),
+                       make_ou(make_tu(0, 1), make_tu(0, 1)), (ou_int)0))
+    return 1;
+  // Path 1: dividend.s.high >= divisor.s.low (needs two-step division).
+  // (3 * 2^128) / (2^128 - 1) = 3, remainder 3.
+  if (test__udivmodoi4(make_ou(make_tu(0, 3), make_tu(0, 0)),
+                       make_ou(make_tu(0, 0), make_tu(-1, -1)), (ou_int)3,
+                       (ou_int)3))
+    return 1;
+  // Path 2: Both halves set in divisor. Bit-by-bit division.
+  // (2^256 - 1) / (2^128 + 1) = 2^128 - 1, remainder 0.
+  if (test__udivmodoi4((ou_int)-1, make_ou(make_tu(0, 1), make_tu(0, 1)),
+                       make_ou(make_tu(0, 0), make_tu(-1, -1)), (ou_int)0))
+    return 1;
+  // Path 2: Large 256-bit divisor with remainder.
+  // (2^255) / (2^254 + 1): quotient = 1, remainder = 2^254 - 1.
+  {
+    ou_int dividend = make_ou(make_tu(0x8000000000000000ULL, 0), make_tu(0, 0));
+    ou_int divisor = make_ou(make_tu(0x4000000000000000ULL, 0), make_tu(0, 1));
+    ou_int exp_q = (ou_int)1;
+    ou_int exp_r = make_ou(make_tu(0x3FFFFFFFFFFFFFFFULL, -1), make_tu(-1, -1));
+    if (test__udivmodoi4(dividend, divisor, exp_q, exp_r))
+      return 1;
+  }
+  // Verify q * b + r == a invariant for a non-trivial case.
+  // a = 0xDEADBEEF12345678 (repeated), b = 0xCAFEBABE (fits in 128 bits).
+  {
+    ou_int a = make_ou(make_tu(0xDEADBEEF12345678ULL, 0xDEADBEEF12345678ULL),
+                       make_tu(0xDEADBEEF12345678ULL, 0xDEADBEEF12345678ULL));
+    ou_int b = (ou_int)0xCAFEBABEULL;
+    ou_int r;
+    ou_int q = __udivmodoi4(a, b, &r);
+    if (q * b + r != a)
+      return 1;
+    // Remainder must be less than divisor.
+    if (r >= b)
+      return 1;
+  }
+  // Verify q * b + r == a for a large divisor spanning both halves.
+  {
+    ou_int a = make_ou(make_tu(0xAAAAAAAAAAAAAAAAULL, 0xBBBBBBBBBBBBBBBBULL),
+                       make_tu(0xCCCCCCCCCCCCCCCCULL, 0xDDDDDDDDDDDDDDDDULL));
+    ou_int b = make_ou(make_tu(0, 0x1234567890ABCDEFULL),
+                       make_tu(0xFEDCBA0987654321ULL, 0x1111111111111111ULL));
+    ou_int r;
+    ou_int q = __udivmodoi4(a, b, &r);
+    if (q * b + r != a)
+      return 1;
+    if (r >= b)
+      return 1;
+  }
+  // Full-width big-number test (all 4 limbs populated).
+  // A / B (unsigned): q = 9, r verified by Python: q*b + r == a.
+  // Expected values verified by Python arbitrary-precision arithmetic.
+  if (test__udivmodoi4(
+          make_ou(make_tu(0xAAAABBBBCCCCDDDDULL, 0xEEEEFFFF11112222ULL),
+                  make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_ou(make_tu(0x1111222233334444ULL, 0x5555666677778888ULL),
+                  make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          (ou_int)9,
+          make_ou(make_tu(0x11108887FFFF7776ULL, 0xEEEE6664DDDD5554ULL),
+                  make_tu(0xCCCC4443BBBB3332ULL, 0xAAAA222199A16667ULL))))
+    return 1;
+  // === Pseudo-random invariant checker ===
+  // Generate ~100 test vectors using a simple LCG and verify q * b + r == a
+  // and r < b for each. This catches systematic bugs in the Knuth algorithm
+  // that hand-picked cases might miss.
+  {
+    // LCG parameters (Numerical Recipes)
+    unsigned long long seed = 0xDEADBEEFCAFEBABEULL;
+    int failures = 0;
+    for (int i = 0; i < 100; ++i) {
+      // Generate pseudo-random a and b using LCG
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long w0 = seed;
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long w1 = seed;
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long w2 = seed;
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long w3 = seed;
+      ou_int a = make_ou(make_tu(w3, w2), make_tu(w1, w0));
+
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long d0 = seed;
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long d1 = seed;
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long d2 = seed;
+      seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+      unsigned long long d3 = seed;
+      ou_int b = make_ou(make_tu(d3, d2), make_tu(d1, d0));
+      if (b == 0)
+        b = 1;
+
+      ou_int r;
+      ou_int q = __udivmodoi4(a, b, &r);
+      // Invariant: q * b + r == a
+      if (q * b + r != a) {
+        printf("invariant failure at i=%d: q * b + r != a\n", i);
+        failures++;
+      }
+      // Invariant: r < b
+      if (r >= b) {
+        printf("invariant failure at i=%d: r >= b\n", i);
+        failures++;
+      }
+    }
+    if (failures)
+      return 1;
+  }
+  // === Divisor size sweep ===
+  // Test with divisors of varying sizes: 1 bit, 32 bits, 64 bits,
+  // 128 bits, 192 bits, 255 bits. This exercises both Path 1
+  // (divisor.high == 0) and Path 2 (both halves).
+  {
+    ou_int dividend = (ou_int)-1; // max value
+    ou_int r;
+    ou_int q;
+    // 1-bit divisor
+    q = __udivmodoi4(dividend, (ou_int)1, &r);
+    if (q != dividend || r != 0)
+      return 1;
+    // 32-bit divisor
+    q = __udivmodoi4(dividend, (ou_int)0xFFFFFFFFULL, &r);
+    if (q * (ou_int)0xFFFFFFFFULL + r != dividend)
+      return 1;
+    // 64-bit divisor
+    q = __udivmodoi4(dividend, (ou_int)0xFFFFFFFFFFFFFFFFULL, &r);
+    if (q * (ou_int)0xFFFFFFFFFFFFFFFFULL + r != dividend)
+      return 1;
+    // 128-bit divisor (all ones in low half)
+    {
+      ou_int d128 = make_ou(make_tu(0, 0), make_tu(-1, -1));
+      q = __udivmodoi4(dividend, d128, &r);
+      if (q * d128 + r != dividend)
+        return 1;
+    }
+    // 192-bit divisor
+    {
+      ou_int d192 = make_ou(make_tu(0, -1), make_tu(-1, -1));
+      q = __udivmodoi4(dividend, d192, &r);
+      if (q * d192 + r != dividend)
+        return 1;
+    }
+    // 255-bit divisor (max >> 1)
+    {
+      ou_int d255 = (ou_int)-1 >> 1;
+      q = __udivmodoi4(dividend, d255, &r);
+      if (q * d255 + r != dividend)
+        return 1;
+    }
+  }
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/udivoi3_test.c b/compiler-rt/test/builtins/Unit/udivoi3_test.c
new file mode 100644
index 0000000000000..ffe90f5175611
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/udivoi3_test.c
@@ -0,0 +1,92 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_udivoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __udivoi3(ou_int a, ou_int b);
+
+int test__udivoi3(ou_int a, ou_int b, ou_int expected) {
+  ou_int x = __udivoi3(a, b);
+  if (x != expected) {
+    printf("error in __udivoi3\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__udivoi3((ou_int)0, (ou_int)1, (ou_int)0))
+    return 1;
+  if (test__udivoi3((ou_int)1, (ou_int)1, (ou_int)1))
+    return 1;
+  if (test__udivoi3((ou_int)10, (ou_int)3, (ou_int)3))
+    return 1;
+  if (test__udivoi3((ou_int)100, (ou_int)7, (ou_int)14))
+    return 1;
+  if (test__udivoi3((ou_int)42, (ou_int)42, (ou_int)1))
+    return 1;
+  // Dividend < divisor
+  if (test__udivoi3((ou_int)3, (ou_int)10, (ou_int)0))
+    return 1;
+  // Large value in high half / small
+  // (1 << 128) / 2 = (1 << 127)
+  if (test__udivoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)2,
+                    make_ou(make_tu(0, 0), make_tu(0x8000000000000000ULL, 0))))
+    return 1;
+  // (1 << 128) / 3
+  if (test__udivoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)3,
+                    make_ou(make_tu(0, 0), make_tu(0x5555555555555555ULL,
+                                                   0x5555555555555555ULL))))
+    return 1;
+  // Large / large (same value)
+  {
+    ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+    if (test__udivoi3(big, big, (ou_int)1))
+      return 1;
+  }
+  // Large / large (double)
+  {
+    ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+    ou_int dbl = make_ou(make_tu(0, 0x200), make_tu(0, 0));
+    if (test__udivoi3(dbl, big, (ou_int)2))
+      return 1;
+  }
+  // All-ones / 2
+  if (test__udivoi3(
+          (ou_int)-1, (ou_int)2,
+          make_ou(make_tu(0x7FFFFFFFFFFFFFFFULL, -1), make_tu(-1, -1))))
+    return 1;
+  // Cross-half boundary value / small
+  if (test__udivoi3(make_ou(make_tu(0, 1), make_tu(0, 4)), (ou_int)4,
+                    make_ou(make_tu(0, 0), make_tu(0x4000000000000000ULL, 1))))
+    return 1;
+  // Very large divisor in high half
+  {
+    ou_int big = make_ou(make_tu(1, 0), make_tu(0, 0));
+    if (test__udivoi3(big, big, (ou_int)1))
+      return 1;
+  }
+  // Full-width big-number test (all 4 limbs populated).
+  // A / B (unsigned) = 9.
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__udivoi3(
+          make_ou(make_tu(0xAAAABBBBCCCCDDDDULL, 0xEEEEFFFF11112222ULL),
+                  make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_ou(make_tu(0x1111222233334444ULL, 0x5555666677778888ULL),
+                  make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          (ou_int)9))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/umodoi3_test.c b/compiler-rt/test/builtins/Unit/umodoi3_test.c
new file mode 100644
index 0000000000000..db8397bc6081c
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/umodoi3_test.c
@@ -0,0 +1,80 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_umodoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __umodoi3(ou_int a, ou_int b);
+
+int test__umodoi3(ou_int a, ou_int b, ou_int expected) {
+  ou_int x = __umodoi3(a, b);
+  if (x != expected) {
+    printf("error in __umodoi3\n");
+    return 1;
+  }
+  return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+  if (test__umodoi3((ou_int)0, (ou_int)1, (ou_int)0))
+    return 1;
+  if (test__umodoi3((ou_int)10, (ou_int)3, (ou_int)1))
+    return 1;
+  if (test__umodoi3((ou_int)100, (ou_int)7, (ou_int)2))
+    return 1;
+  if (test__umodoi3((ou_int)42, (ou_int)42, (ou_int)0))
+    return 1;
+  if (test__umodoi3((ou_int)3, (ou_int)10, (ou_int)3))
+    return 1;
+  // (1 << 128) % 2 = 0
+  if (test__umodoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)2,
+                    (ou_int)0))
+    return 1;
+  // (1 << 128) % 3 = 1
+  if (test__umodoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)3,
+                    (ou_int)1))
+    return 1;
+  // All-ones % 2 = 1
+  if (test__umodoi3((ou_int)-1, (ou_int)2, (ou_int)1))
+    return 1;
+  // Cross-half boundary value mod small
+  if (test__umodoi3(make_ou(make_tu(0, 1), make_tu(0, 5)), (ou_int)4,
+                    (ou_int)1))
+    return 1;
+  // Large mod large (same value)
+  {
+    ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+    if (test__umodoi3(big, big, (ou_int)0))
+      return 1;
+  }
+  // Large mod large (double)
+  {
+    ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+    ou_int dbl = make_ou(make_tu(0, 0x200), make_tu(0, 0));
+    if (test__umodoi3(dbl, big, (ou_int)0))
+      return 1;
+  }
+  // Full-width big-number test (all 4 limbs populated).
+  // A % B (unsigned), verified by Python: q*b + r == a.
+  // Expected value verified by Python arbitrary-precision arithmetic.
+  if (test__umodoi3(
+          make_ou(make_tu(0xAAAABBBBCCCCDDDDULL, 0xEEEEFFFF11112222ULL),
+                  make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+          make_ou(make_tu(0x1111222233334444ULL, 0x5555666677778888ULL),
+                  make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+          make_ou(make_tu(0x11108887FFFF7776ULL, 0xEEEE6664DDDD5554ULL),
+                  make_tu(0xCCCC4443BBBB3332ULL, 0xAAAA222199A16667ULL))))
+    return 1;
+#else
+  printf("skipped\n");
+#endif
+  return 0;
+}

>From e7a72f8342d67c4a3ad069108dde284938b58bcb Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:12 +0100
Subject: [PATCH 09/17] [compiler-rt] Add __int256 UBSan support

Extend UndefinedBehaviorSanitizer to handle 256-bit integers:
- ubsan_value.h: Add 256-bit case to getIntegerBitWidth/Value
- ubsan_value.cpp: Support 256-bit integer rendering in diagnostics
- ubsan_diag.cpp: Handle 256-bit integer formatting
- Test: signed overflow detection with __int256 values

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 compiler-rt/lib/ubsan/ubsan_diag.cpp          |  8 +++-
 compiler-rt/lib/ubsan/ubsan_value.cpp         | 15 +++++++
 compiler-rt/lib/ubsan/ubsan_value.h           | 13 +++++-
 .../TestCases/Integer/int256-overflow.cpp     | 45 +++++++++++++++++++
 4 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp

diff --git a/compiler-rt/lib/ubsan/ubsan_diag.cpp b/compiler-rt/lib/ubsan/ubsan_diag.cpp
index 2146ed3c27287..cdb2b6b489622 100644
--- a/compiler-rt/lib/ubsan/ubsan_diag.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_diag.cpp
@@ -133,7 +133,13 @@ Diag &Diag::operator<<(const Value &V) {
 
 /// Hexadecimal printing for numbers too large for Printf to handle directly.
 static void RenderHex(InternalScopedString *Buffer, UIntMax Val) {
-#if HAVE_INT128_T
+#if HAVE_INT256_T
+  Buffer->AppendF("0x%08x%08x%08x%08x%08x%08x%08x%08x",
+                  (unsigned int)(Val >> 224), (unsigned int)(Val >> 192),
+                  (unsigned int)(Val >> 160), (unsigned int)(Val >> 128),
+                  (unsigned int)(Val >> 96), (unsigned int)(Val >> 64),
+                  (unsigned int)(Val >> 32), (unsigned int)(Val));
+#elif HAVE_INT128_T
   Buffer->AppendF("0x%08x%08x%08x%08x", (unsigned int)(Val >> 96),
                   (unsigned int)(Val >> 64), (unsigned int)(Val >> 32),
                   (unsigned int)(Val));
diff --git a/compiler-rt/lib/ubsan/ubsan_value.cpp b/compiler-rt/lib/ubsan/ubsan_value.cpp
index 6e88ebaf34d4b..64ec0cc374a3e 100644
--- a/compiler-rt/lib/ubsan/ubsan_value.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_value.cpp
@@ -85,6 +85,14 @@ SIntMax Value::getSIntValue() const {
 #else
   if (getType().getIntegerBitWidth() == 128)
     UNREACHABLE("libclang_rt.ubsan was built without __int128 support");
+#endif
+#if HAVE_INT256_T
+  if (getType().getIntegerBitWidth() == 256)
+    return SIntMax(UIntMax(*reinterpret_cast<s256 *>(Val)) << ExtraBits) >>
+           ExtraBits;
+#else
+  if (getType().getIntegerBitWidth() == 256)
+    UNREACHABLE("libclang_rt.ubsan was built without __int256 support");
 #endif
   UNREACHABLE("unexpected bit width");
 }
@@ -101,6 +109,13 @@ UIntMax Value::getUIntValue() const {
 #else
   if (getType().getIntegerBitWidth() == 128)
     UNREACHABLE("libclang_rt.ubsan was built without __int128 support");
+#endif
+#if HAVE_INT256_T
+  if (getType().getIntegerBitWidth() == 256)
+    return *reinterpret_cast<u256 *>(Val);
+#else
+  if (getType().getIntegerBitWidth() == 256)
+    UNREACHABLE("libclang_rt.ubsan was built without __int256 support");
 #endif
   UNREACHABLE("unexpected bit width");
 }
diff --git a/compiler-rt/lib/ubsan/ubsan_value.h b/compiler-rt/lib/ubsan/ubsan_value.h
index ee523cf5ddda5..9e3699d47f29b 100644
--- a/compiler-rt/lib/ubsan/ubsan_value.h
+++ b/compiler-rt/lib/ubsan/ubsan_value.h
@@ -25,10 +25,21 @@ __extension__ typedef unsigned __int128 u128;
 #define HAVE_INT128_T 0
 #endif
 
+#if __SIZEOF_INT256__
+__extension__ typedef __int256_t s256;
+__extension__ typedef __uint256_t u256;
+#define HAVE_INT256_T 1
+#else
+#define HAVE_INT256_T 0
+#endif
+
 namespace __ubsan {
 
 /// \brief Largest integer types we support.
-#if HAVE_INT128_T
+#if HAVE_INT256_T
+typedef s256 SIntMax;
+typedef u256 UIntMax;
+#elif HAVE_INT128_T
 typedef s128 SIntMax;
 typedef u128 UIntMax;
 #else
diff --git a/compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp b/compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp
new file mode 100644
index 0000000000000..a87bbd56dd41d
--- /dev/null
+++ b/compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp
@@ -0,0 +1,45 @@
+// REQUIRES: int256
+//
+// RUN: %clangxx -DADD_I256 -fsanitize=signed-integer-overflow %s -o %t1 && %run %t1 2>&1 | FileCheck %s --check-prefix=CHECK-ADD_I256
+// RUN: %clangxx -DSUB_I256 -fsanitize=signed-integer-overflow %s -o %t2 && %run %t2 2>&1 | FileCheck %s --check-prefix=CHECK-SUB_I256
+// RUN: %clangxx -DNEG_I256 -fsanitize=signed-integer-overflow %s -o %t3 && %run %t3 2>&1 | FileCheck %s --check-prefix=CHECK-NEG_I256
+//
+// Test UBSan detection of signed integer overflow for __int256_t.
+
+#include <stdint.h>
+
+int main() {
+#ifdef ADD_I256
+#  if defined(__SIZEOF_INT256__)
+  // Overflow: 2^254 + 2^254 = 2^255, which exceeds __int256_t max (2^255 - 1)
+  (void)((__int256_t(1) << 254) + (__int256_t(1) << 254));
+#  else
+  // Fallback message for platforms without __int256
+  __builtin_printf("__int256 not supported\n");
+#  endif
+  // CHECK-ADD_I256: {{0x[0-9a-f]+ \+ 0x[0-9a-f]+ cannot be represented in type '__int256_t'|__int256 not supported}}
+#endif
+
+#ifdef SUB_I256
+#  if defined(__SIZEOF_INT256__)
+  // Overflow: min - 1
+  __int256_t min_val = (__int256_t)1
+                       << 255; // This is the minimum (negative) value
+  (void)(min_val - 1);
+#  else
+  __builtin_printf("__int256 not supported\n");
+#  endif
+  // CHECK-SUB_I256: {{0x[0-9a-f]+ - 1 cannot be represented in type '__int256_t'|__int256 not supported}}
+#endif
+
+#ifdef NEG_I256
+#  if defined(__SIZEOF_INT256__)
+  // Overflow: -min = -(-2^255) overflows because max is 2^255 - 1
+  __int256_t min_val = (__int256_t)1 << 255;
+  (void)(-min_val);
+#  else
+  __builtin_printf("__int256 not supported\n");
+#  endif
+  // CHECK-NEG_I256: {{negation of -?0x[0-9a-f]+ cannot be represented in type '__int256_t'|__int256 not supported}}
+#endif
+}

>From 307e69a423384b8a347ebd77fa0f3d8eb171bf07 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:37 +0100
Subject: [PATCH 10/17] [libc++] Add __int256/__uint256 support

Wire __int256 through libc++ infrastructure (guarded by _LIBCPP_HAS_INT256):
- Type traits: is_integral, make_signed/unsigned, promote, integer_traits,
  make_32_64_128_or_256_bit, convert_to_integral
- numeric_limits<__int256_t/__uint256_t> specialization
- <charconv>: to_chars/from_chars tables and traits for 256-bit
- <format>: format_arg, format_arg_store, formatter_integer support
- <algorithm>: radix_sort key type support
- <ranges>: iota_view difference type
- <bit>: byteswap specialization
- <random>: is_valid integer trait extension
- Module map: add make_32_64_128_or_256_bit.h header

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 libcxx/include/CMakeLists.txt                 |   2 +-
 libcxx/include/__algorithm/radix_sort.h       |   7 ++
 libcxx/include/__bit/byteswap.h               |   5 +
 libcxx/include/__charconv/tables.h            | 110 ++++++++++++++++++
 libcxx/include/__charconv/to_chars_base_10.h  |  61 ++++++++++
 libcxx/include/__charconv/to_chars_integral.h |  24 +++-
 libcxx/include/__charconv/traits.h            |  24 ++++
 libcxx/include/__config                       |   6 +
 libcxx/include/__format/format_arg.h          |  71 ++++++++++-
 libcxx/include/__format/format_arg_store.h    |   8 ++
 libcxx/include/__format/format_functions.h    |  14 +++
 libcxx/include/__format/formatter_integer.h   |  20 +++-
 libcxx/include/__random/is_valid.h            |   7 ++
 libcxx/include/__ranges/iota_view.h           |   7 +-
 libcxx/include/__type_traits/integer_traits.h |   8 ++
 libcxx/include/__type_traits/is_integral.h    |   4 +
 ..._128_bit.h => make_32_64_128_or_256_bit.h} |  18 +--
 libcxx/include/__type_traits/make_signed.h    |   8 ++
 libcxx/include/__type_traits/make_unsigned.h  |   8 ++
 libcxx/include/__type_traits/promote.h        |   4 +
 .../include/__utility/convert_to_integral.h   |   6 +
 libcxx/include/limits                         |   5 +-
 libcxx/include/module.modulemap.in            |   2 +-
 23 files changed, 411 insertions(+), 18 deletions(-)
 rename libcxx/include/__type_traits/{make_32_64_or_128_bit.h => make_32_64_128_or_256_bit.h} (70%)

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 5cdf29b94e3eb..3eb049b59dbf2 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -892,7 +892,7 @@ set(files
   __type_traits/is_volatile.h
   __type_traits/is_within_lifetime.h
   __type_traits/lazy.h
-  __type_traits/make_32_64_or_128_bit.h
+  __type_traits/make_32_64_128_or_256_bit.h
   __type_traits/make_const_lvalue_ref.h
   __type_traits/make_signed.h
   __type_traits/make_transparent.h
diff --git a/libcxx/include/__algorithm/radix_sort.h b/libcxx/include/__algorithm/radix_sort.h
index 5549a69f5e220..f40adee79906a 100644
--- a/libcxx/include/__algorithm/radix_sort.h
+++ b/libcxx/include/__algorithm/radix_sort.h
@@ -334,6 +334,13 @@ struct __unsigned_integer_of_size<16> {
 };
 #  endif
 
+#  if _LIBCPP_HAS_INT256
+template <>
+struct __unsigned_integer_of_size<32> {
+  using type _LIBCPP_NODEBUG = unsigned __int256;
+};
+#  endif
+
 template <size_t _Size>
 using __unsigned_integer_of_size_t _LIBCPP_NODEBUG = typename __unsigned_integer_of_size<_Size>::type;
 
diff --git a/libcxx/include/__bit/byteswap.h b/libcxx/include/__bit/byteswap.h
index 7ce7e069b4142..326ff5dbb9cf5 100644
--- a/libcxx/include/__bit/byteswap.h
+++ b/libcxx/include/__bit/byteswap.h
@@ -41,6 +41,11 @@ template <integral _Tp>
            static_cast<_Tp>(byteswap(static_cast<uint64_t>(__val >> 64)));
 #    endif // __has_builtin(__builtin_bswap128)
 #  endif   // _LIBCPP_HAS_INT128
+#  if _LIBCPP_HAS_INT256
+  } else if constexpr (sizeof(_Tp) == 32) {
+    return (static_cast<_Tp>(byteswap(static_cast<__uint128_t>(__val))) << 128) |
+           static_cast<_Tp>(byteswap(static_cast<__uint128_t>(__val >> 128)));
+#  endif // _LIBCPP_HAS_INT256
   } else {
     static_assert(sizeof(_Tp) == 0, "byteswap is unimplemented for integral types of this size");
   }
diff --git a/libcxx/include/__charconv/tables.h b/libcxx/include/__charconv/tables.h
index b8c6fd8af0a0f..4e4aa8b4ef68c 100644
--- a/libcxx/include/__charconv/tables.h
+++ b/libcxx/include/__charconv/tables.h
@@ -138,6 +138,116 @@ inline _LIBCPP_CONSTEXPR const __uint128_t __pow10_128[40] = {
     (__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * 10};
 #  endif
 
+#  if _LIBCPP_HAS_INT256
+inline _LIBCPP_CONSTEXPR const int __pow10_256_offset      = 0;
+inline _LIBCPP_CONSTEXPR const __uint256_t __pow10_256[78] = {
+    UINT64_C(0),
+    UINT64_C(10),
+    UINT64_C(100),
+    UINT64_C(1000),
+    UINT64_C(10000),
+    UINT64_C(100000),
+    UINT64_C(1000000),
+    UINT64_C(10000000),
+    UINT64_C(100000000),
+    UINT64_C(1000000000),
+    UINT64_C(10000000000),
+    UINT64_C(100000000000),
+    UINT64_C(1000000000000),
+    UINT64_C(10000000000000),
+    UINT64_C(100000000000000),
+    UINT64_C(1000000000000000),
+    UINT64_C(10000000000000000),
+    UINT64_C(100000000000000000),
+    UINT64_C(1000000000000000000),
+    UINT64_C(10000000000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000000),
+    __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * 10,
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(10000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(10000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(10000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        UINT64_C(10000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        UINT64_C(100000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        UINT64_C(1000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        UINT64_C(10000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        UINT64_C(100000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        UINT64_C(1000000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        UINT64_C(10000000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000),
+    __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+        __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000) * 10};
+#  endif
+
 inline _LIBCPP_CONSTEXPR const char __digits_base_10[200] = {
     // clang-format off
     '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
diff --git a/libcxx/include/__charconv/to_chars_base_10.h b/libcxx/include/__charconv/to_chars_base_10.h
index d90952ea71f35..15888ac0573d2 100644
--- a/libcxx/include/__charconv/to_chars_base_10.h
+++ b/libcxx/include/__charconv/to_chars_base_10.h
@@ -175,6 +175,67 @@ __base_10_u128(char* __buffer, __uint128_t __value) _NOEXCEPT {
   return __buffer;
 }
 #  endif
+
+#if _LIBCPP_HAS_INT256
+/// \returns 10^\a __exp
+///
+/// \pre \a __exp [0, 77]
+_LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI inline __uint256_t __pow_10_256(int __exp) _NOEXCEPT {
+  _LIBCPP_ASSERT_INTERNAL(__exp >= __pow10_256_offset, "Index out of bounds");
+  return __pow10_256[__exp - __pow10_256_offset];
+}
+
+_LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI inline char*
+__base_10_u256(char* __buffer, __uint256_t __value) _NOEXCEPT {
+  _LIBCPP_ASSERT_INTERNAL(
+      __value > numeric_limits<__uint128_t>::max(), "The optimizations for this algorithm fail when this isn't true.");
+
+  // Maximum unsigned values:
+  // 128 bit                  340'282'366'920'938'463'463'374'607'431'768'211'455 (39 digits)
+  // 256 bit  115'792'089'237'316'195'423'570'985'008'687'907'853'
+  //          269'984'665'640'564'039'457'584'007'913'129'639'935   (78 digits)
+  //
+  // Strategy: divide into chunks of 19 digits (10^19 fits in uint64_t).
+  // A 256-bit number has at most 78 digits: 4 full 19-digit chunks plus up to 2 leading digits.
+  // We peel off 19-digit chunks from the bottom using 256-bit division by 10^19.
+
+  __uint256_t __p19 = __pow_10_256(19);
+
+  // A 256-bit number has at most 78 digits = 5 chunks of up to 19 digits.
+  // NOTE(review): __pow_10_256(19) assumes __pow10_256_offset <= 19 -- confirm the table starts at or below 10^19.
+  uint64_t __c0 = static_cast<uint64_t>(__value % __p19);
+  __value /= __p19;
+  uint64_t __c1 = static_cast<uint64_t>(__value % __p19);
+  __value /= __p19;
+  uint64_t __c2 = static_cast<uint64_t>(__value % __p19);
+  __value /= __p19;
+  uint64_t __c3 = static_cast<uint64_t>(__value % __p19);
+  __value /= __p19;
+  uint64_t __c4 = static_cast<uint64_t>(__value); // at most 2 digits
+
+  // Emit 19-digit zero-padded chunk: [9 digits] + [10 digits]
+  auto __emit_padded = [&](uint64_t __c) {
+    __buffer = __itoa::__append9(__buffer, static_cast<uint32_t>(__c / 10000000000));
+    __buffer = __itoa::__append10(__buffer, __c % 10000000000);
+  };
+
+  // Find the first non-zero chunk and emit it with variable width.
+  if (__c4) {
+    __buffer = __base_10_u64(__buffer, __c4);
+    __emit_padded(__c3);
+    __emit_padded(__c2);
+  } else if (__c3) {
+    __buffer = __base_10_u64(__buffer, __c3);
+    __emit_padded(__c2);
+  } else {
+    __buffer = __base_10_u64(__buffer, __c2);
+  }
+  __emit_padded(__c1);
+  __emit_padded(__c0);
+
+  return __buffer;
+}
+#endif
 } // namespace __itoa
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__charconv/to_chars_integral.h b/libcxx/include/__charconv/to_chars_integral.h
index 6d425139260b6..66680b2f1be4d 100644
--- a/libcxx/include/__charconv/to_chars_integral.h
+++ b/libcxx/include/__charconv/to_chars_integral.h
@@ -25,7 +25,7 @@
 #include <__type_traits/is_integral.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_signed.h>
-#include <__type_traits/make_32_64_or_128_bit.h>
+#include <__type_traits/make_32_64_128_or_256_bit.h>
 #include <__type_traits/make_unsigned.h>
 #include <__utility/unreachable.h>
 #include <cstdint>
@@ -89,6 +89,24 @@ __to_chars_itoa(char* __first, char* __last, __uint128_t __value, false_type) {
 }
 #  endif
 
+#if _LIBCPP_HAS_INT256
+template <>
+inline _LIBCPP_CONSTEXPR_SINCE_CXX23
+_LIBCPP_HIDE_FROM_ABI __to_chars_result __to_chars_itoa(char* __first, char* __last, __uint256_t __value, false_type) {
+  // When the value fits in 128-bits use the 128-bit code path.
+  if (__value <= numeric_limits<__uint128_t>::max())
+    return __to_chars_itoa(__first, __last, static_cast<__uint128_t>(__value), false_type());
+
+  using __tx  = __itoa::__traits<__uint256_t>;
+  auto __diff = __last - __first;
+
+  if (__tx::digits <= __diff || __tx::__width(__value) <= __diff)
+    return {__tx::__convert(__first, __value), errc(0)};
+  else
+    return {__last, errc::value_too_large};
+}
+#endif
+
 template <class _Tp, __enable_if_t<!is_signed<_Tp>::value, int> = 0>
 inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI __to_chars_result
 __to_chars_integral(char* __first, char* __last, _Tp __value, int __base);
@@ -321,7 +339,7 @@ to_chars_result to_chars(char*, char*, bool, int = 10) = delete;
 template <typename _Tp, __enable_if_t<is_integral<_Tp>::value, int> = 0>
 inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI to_chars_result
 to_chars(char* __first, char* __last, _Tp __value) {
-  using _Type = __make_32_64_or_128_bit_t<_Tp>;
+  using _Type = __make_32_64_128_or_256_bit_t<_Tp>;
   static_assert(!is_same<_Type, void>::value, "unsupported integral type used in to_chars");
   return std::__to_chars_itoa(__first, __last, static_cast<_Type>(__value), is_signed<_Tp>());
 }
@@ -331,7 +349,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI to_chars_result
 to_chars(char* __first, char* __last, _Tp __value, int __base) {
   _LIBCPP_ASSERT_UNCATEGORIZED(2 <= __base && __base <= 36, "base not in [2, 36]");
 
-  using _Type = __make_32_64_or_128_bit_t<_Tp>;
+  using _Type = __make_32_64_128_or_256_bit_t<_Tp>;
   return std::__to_chars_integral(__first, __last, static_cast<_Type>(__value), __base);
 }
 
diff --git a/libcxx/include/__charconv/traits.h b/libcxx/include/__charconv/traits.h
index b8c840d1ebe32..46691ae70db89 100644
--- a/libcxx/include/__charconv/traits.h
+++ b/libcxx/include/__charconv/traits.h
@@ -113,6 +113,30 @@ struct _LIBCPP_HIDDEN __traits_base<_Tp, __enable_if_t<sizeof(_Tp) == sizeof(__u
 };
 #  endif
 
+#if _LIBCPP_HAS_INT256
+template <typename _Tp>
+struct _LIBCPP_HIDDEN __traits_base<_Tp, __enable_if_t<sizeof(_Tp) == sizeof(__uint256_t)> > {
+  using type = __uint256_t;
+
+  static _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI int __width(_Tp __v) {
+    _LIBCPP_ASSERT_INTERNAL(
+        __v > numeric_limits<__uint128_t>::max(), "The optimizations for this algorithm fail when this isn't true.");
+    // There's always a bit set in the upper 128-bits.
+    auto __t = (256 - std::__countl_zero(static_cast<__uint128_t>(__v >> 128))) * 1233 >> 12;
+    _LIBCPP_ASSERT_INTERNAL(__t >= __itoa::__pow10_256_offset, "Index out of bounds");
+    return __t - (__v < __itoa::__pow10_256[__t - __itoa::__pow10_256_offset]) + 1;
+  }
+
+  static _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI char* __convert(char* __p, _Tp __v) {
+    return __itoa::__base_10_u256(__p, __v);
+  }
+
+  static _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI decltype(__pow10_256)& __pow() {
+    return __itoa::__pow10_256;
+  }
+};
+#endif
+
 template <typename _Tp, typename _Up>
 _LIBCPP_HIDE_FROM_ABI bool _LIBCPP_CONSTEXPR_SINCE_CXX23 __mul_overflowed(_Tp __a, _Up __b, _Tp& __r) {
   static_assert(is_unsigned<_Tp>::value);
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 9cb98bbb59341..ab91ec999d2aa 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -268,6 +268,12 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_HAS_INT128 1
 #  endif
 
+#  if !defined(__SIZEOF_INT256__) || defined(_MSC_VER)
+#    define _LIBCPP_HAS_INT256 0
+#  else
+#    define _LIBCPP_HAS_INT256 1
+#  endif
+
 #  ifdef _LIBCPP_CXX03_LANG
 #    define _LIBCPP_DECLARE_STRONG_ENUM(x)                                                                             \
       struct _LIBCPP_EXPORTED_FROM_ABI x {                                                                             \
diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h
index 19794f0f084ce..3996185029080 100644
--- a/libcxx/include/__format/format_arg.h
+++ b/libcxx/include/__format/format_arg.h
@@ -42,8 +42,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __format {
 /// The type stored in @ref basic_format_arg.
 ///
-/// @note The 128-bit types are unconditionally in the list to avoid the values
-/// of the enums to depend on the availability of 128-bit integers.
+/// @note The 128-bit and 256-bit types are unconditionally in the list so that
+/// the values of the enums do not depend on the availability of extended integers.
 ///
 /// @note The value is stored as a 5-bit value in the __packed_arg_t_bits. This
 /// limits the maximum number of elements to 32.
@@ -65,9 +65,11 @@ enum class __arg_t : uint8_t {
   __int,
   __long_long,
   __i128, // extension
+  __i256, // extension
   __unsigned,
   __unsigned_long_long,
   __u128, // extension
+  __u256, // extension
   __float,
   __double,
   __long_double,
@@ -118,6 +120,12 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_
     return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__i128_);
 #  else
     __libcpp_unreachable();
+#  endif
+  case __format::__arg_t::__i256:
+#  if _LIBCPP_HAS_INT256
+    return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__i256_);
+#  else
+    __libcpp_unreachable();
 #  endif
   case __format::__arg_t::__unsigned:
     return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_);
@@ -128,6 +136,12 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_
     return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__u128_);
 #  else
     __libcpp_unreachable();
+#  endif
+  case __format::__arg_t::__u256:
+#  if _LIBCPP_HAS_INT256
+    return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__u256_);
+#  else
+    __libcpp_unreachable();
 #  endif
   case __format::__arg_t::__float:
     return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__float_);
@@ -169,6 +183,12 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<
     return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__i128_);
 #    else
     __libcpp_unreachable();
+#    endif
+  case __format::__arg_t::__i256:
+#    if _LIBCPP_HAS_INT256
+    return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__i256_);
+#    else
+    __libcpp_unreachable();
 #    endif
   case __format::__arg_t::__unsigned:
     return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_);
@@ -179,6 +199,12 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<
     return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__u128_);
 #    else
     __libcpp_unreachable();
+#    endif
+  case __format::__arg_t::__u256:
+#    if _LIBCPP_HAS_INT256
+    return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__u256_);
+#    else
+    __libcpp_unreachable();
 #    endif
   case __format::__arg_t::__float:
     return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__float_);
@@ -241,6 +267,10 @@ class __basic_format_arg_value {
 #  if _LIBCPP_HAS_INT128
     __int128_t __i128_;
     __uint128_t __u128_;
+#  endif
+#  if _LIBCPP_HAS_INT256
+    __int256_t __i256_;
+    __uint256_t __u256_;
 #  endif
     float __float_;
     double __double_;
@@ -265,6 +295,10 @@ class __basic_format_arg_value {
 #  if _LIBCPP_HAS_INT128
   _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__int128_t __value) noexcept : __i128_(__value) {}
   _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__uint128_t __value) noexcept : __u128_(__value) {}
+#  endif
+#  if _LIBCPP_HAS_INT256
+  _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__int256_t __value) noexcept : __i256_(__value) {}
+  _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__uint256_t __value) noexcept : __u256_(__value) {}
 #  endif
   _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(float __value) noexcept : __float_(__value) {}
   _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(double __value) noexcept : __double_(__value) {}
@@ -302,6 +336,17 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg {
       typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_};
       return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
     }
+#    endif
+#    if _LIBCPP_HAS_INT256
+    case __format::__arg_t::__i256: {
+      typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i256_};
+      return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+    }
+
+    case __format::__arg_t::__u256: {
+      typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u256_};
+      return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+    }
 #    endif
     default:
       return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg);
@@ -323,6 +368,17 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg {
       typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_};
       return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
     }
+#    endif
+#    if _LIBCPP_HAS_INT256
+    case __format::__arg_t::__i256: {
+      typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i256_};
+      return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+    }
+
+    case __format::__arg_t::__u256: {
+      typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u256_};
+      return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+    }
 #    endif
     default:
       return std::__visit_format_arg<_Rp>(std::forward<_Visitor>(__vis), __arg);
@@ -385,6 +441,17 @@ visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) {
     return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
   }
 #  endif // _LIBCPP_HAS_INT128
+#  if _LIBCPP_HAS_INT256
+  case __format::__arg_t::__i256: {
+    typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i256_};
+    return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+  }
+
+  case __format::__arg_t::__u256: {
+    typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u256_};
+    return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+  }
+#  endif // _LIBCPP_HAS_INT256
   default:
     return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg);
   }
diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h
index fbb4cad21b232..cab2b9736564b 100644
--- a/libcxx/include/__format/format_arg_store.h
+++ b/libcxx/include/__format/format_arg_store.h
@@ -74,6 +74,10 @@ consteval __arg_t __determine_arg_t() {
 #  if _LIBCPP_HAS_INT128
   else if constexpr (sizeof(_Tp) == sizeof(__int128_t))
     return __arg_t::__i128;
+#  endif
+#  if _LIBCPP_HAS_INT256
+  else if constexpr (sizeof(_Tp) == sizeof(__int256_t))
+    return __arg_t::__i256;
 #  endif
   else
     static_assert(sizeof(_Tp) == 0, "an unsupported signed integer was used");
@@ -89,6 +93,10 @@ consteval __arg_t __determine_arg_t() {
 #  if _LIBCPP_HAS_INT128
   else if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
     return __arg_t::__u128;
+#  endif
+#  if _LIBCPP_HAS_INT256
+  else if constexpr (sizeof(_Tp) == sizeof(__uint256_t))
+    return __arg_t::__u256;
 #  endif
   else
     static_assert(sizeof(_Tp) == 0, "an unsupported unsigned integer was used");
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index 873265bc17c24..cf79838b50c2b 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -212,6 +212,13 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __compile_time_visit_format_arg(
     return __format::__compile_time_validate_argument<_CharT, __int128_t>(__parse_ctx, __ctx);
 #  else
     std::__throw_format_error("Invalid argument");
+#  endif
+    return;
+  case __arg_t::__i256:
+#  if _LIBCPP_HAS_INT256
+    return __format::__compile_time_validate_argument<_CharT, __int256_t>(__parse_ctx, __ctx);
+#  else
+    std::__throw_format_error("Invalid argument");
 #  endif
     return;
   case __arg_t::__unsigned:
@@ -223,6 +230,13 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __compile_time_visit_format_arg(
     return __format::__compile_time_validate_argument<_CharT, __uint128_t>(__parse_ctx, __ctx);
 #  else
     std::__throw_format_error("Invalid argument");
+#  endif
+    return;
+  case __arg_t::__u256:
+#  if _LIBCPP_HAS_INT256
+    return __format::__compile_time_validate_argument<_CharT, __uint256_t>(__parse_ctx, __ctx);
+#  else
+    std::__throw_format_error("Invalid argument");
 #  endif
     return;
   case __arg_t::__float:
diff --git a/libcxx/include/__format/formatter_integer.h b/libcxx/include/__format/formatter_integer.h
index cf186c64e3d0f..b39194195fe2d 100644
--- a/libcxx/include/__format/formatter_integer.h
+++ b/libcxx/include/__format/formatter_integer.h
@@ -19,7 +19,7 @@
 #include <__format/formatter_output.h>
 #include <__format/parser_std_format_spec.h>
 #include <__type_traits/is_void.h>
-#include <__type_traits/make_32_64_or_128_bit.h>
+#include <__type_traits/make_32_64_128_or_256_bit.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -46,7 +46,7 @@ struct __formatter_integer {
     if (__specs.__std_.__type_ == __format_spec::__type::__char)
       return __formatter::__format_char(__value, __ctx.out(), __specs);
 
-    using _Type = __make_32_64_or_128_bit_t<_Tp>;
+    using _Type = __make_32_64_128_or_256_bit_t<_Tp>;
     static_assert(!is_void<_Type>::value, "unsupported integral type used in __formatter_integer::__format");
 
     // Reduce the number of instantiation of the integer formatter
@@ -71,6 +71,10 @@ struct formatter<long long, _CharT> : public __formatter_integer<_CharT> {};
 template <__fmt_char_type _CharT>
 struct formatter<__int128_t, _CharT> : public __formatter_integer<_CharT> {};
 #  endif
+#  if _LIBCPP_HAS_INT256
+template <__fmt_char_type _CharT>
+struct formatter<__int256_t, _CharT> : public __formatter_integer<_CharT> {};
+#  endif
 
 // Unsigned integral types.
 template <__fmt_char_type _CharT>
@@ -87,6 +91,10 @@ struct formatter<unsigned long long, _CharT> : public __formatter_integer<_CharT
 template <__fmt_char_type _CharT>
 struct formatter<__uint128_t, _CharT> : public __formatter_integer<_CharT> {};
 #  endif
+#  if _LIBCPP_HAS_INT256
+template <__fmt_char_type _CharT>
+struct formatter<__uint256_t, _CharT> : public __formatter_integer<_CharT> {};
+#  endif
 
 #  if _LIBCPP_STD_VER >= 23
 template <>
@@ -103,6 +111,10 @@ inline constexpr bool enable_nonlocking_formatter_optimization<long long> = true
 template <>
 inline constexpr bool enable_nonlocking_formatter_optimization<__int128_t> = true;
 #    endif
+#    if _LIBCPP_HAS_INT256
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__int256_t> = true;
+#    endif
 
 template <>
 inline constexpr bool enable_nonlocking_formatter_optimization<unsigned char> = true;
@@ -118,6 +130,10 @@ inline constexpr bool enable_nonlocking_formatter_optimization<unsigned long lon
 template <>
 inline constexpr bool enable_nonlocking_formatter_optimization<__uint128_t> = true;
 #    endif
+#    if _LIBCPP_HAS_INT256
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__uint256_t> = true;
+#    endif
 #  endif // _LIBCPP_STD_VER >= 23
 #endif   // _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__random/is_valid.h b/libcxx/include/__random/is_valid.h
index f6679b3fdc427..af8be6ddc5ad6 100644
--- a/libcxx/include/__random/is_valid.h
+++ b/libcxx/include/__random/is_valid.h
@@ -73,6 +73,13 @@ template <>
 struct __libcpp_random_is_valid_inttype<__uint128_t> : true_type {}; // extension
 #endif                                                               // _LIBCPP_HAS_INT128
 
+#if _LIBCPP_HAS_INT256
+template <>
+struct __libcpp_random_is_valid_inttype<__int256_t> : true_type {}; // extension
+template <>
+struct __libcpp_random_is_valid_inttype<__uint256_t> : true_type {}; // extension
+#endif                                                               // _LIBCPP_HAS_INT256
+
 // [rand.req.urng]/3:
 // A class G meets the uniform random bit generator requirements if G models
 // uniform_random_bit_generator, invoke_result_t<G&> is an unsigned integer type,
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index 6b2576ec6b23d..83a14883c485d 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -60,7 +60,12 @@ struct __get_wider_signed {
       return type_identity<long>{};
     else if constexpr (sizeof(_Int) < sizeof(long long))
       return type_identity<long long>{};
-#  if _LIBCPP_HAS_INT128
+#  if _LIBCPP_HAS_INT256
+    else if constexpr (sizeof(_Int) < sizeof(__int128))
+      return type_identity<__int128>{};
+    else if constexpr (sizeof(_Int) <= sizeof(__int256))
+      return type_identity<__int256>{};
+#  elif _LIBCPP_HAS_INT128
     else if constexpr (sizeof(_Int) <= sizeof(__int128))
       return type_identity<__int128>{};
 #  else
diff --git a/libcxx/include/__type_traits/integer_traits.h b/libcxx/include/__type_traits/integer_traits.h
index fad502c44e301..c0d1937e2935e 100644
--- a/libcxx/include/__type_traits/integer_traits.h
+++ b/libcxx/include/__type_traits/integer_traits.h
@@ -35,6 +35,10 @@ inline const bool __is_signed_integer_v<signed long long> = true;
 template <>
 inline const bool __is_signed_integer_v<__int128_t> = true;
 #endif
+#if _LIBCPP_HAS_INT256
+template <>
+inline const bool __is_signed_integer_v<__int256_t> = true;
+#endif
 
 // This trait is to determine whether a type is an /unsigned integer type/
 // See [basic.fundamental]/p2
@@ -54,6 +58,10 @@ inline const bool __is_unsigned_integer_v<unsigned long long> = true;
 template <>
 inline const bool __is_unsigned_integer_v<__uint128_t> = true;
 #endif
+#if _LIBCPP_HAS_INT256
+template <>
+inline const bool __is_unsigned_integer_v<__uint256_t> = true;
+#endif
 
 #if _LIBCPP_STD_VER >= 20
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/is_integral.h b/libcxx/include/__type_traits/is_integral.h
index 5a340965f0384..8f982842d54bb 100644
--- a/libcxx/include/__type_traits/is_integral.h
+++ b/libcxx/include/__type_traits/is_integral.h
@@ -57,6 +57,10 @@ template <>          struct __libcpp_is_integral<unsigned long long> { enum { va
 template <>          struct __libcpp_is_integral<__int128_t>         { enum { value = 1 }; };
 template <>          struct __libcpp_is_integral<__uint128_t>        { enum { value = 1 }; };
 #endif
+#if _LIBCPP_HAS_INT256
+template <>          struct __libcpp_is_integral<__int256_t>         { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<__uint256_t>        { enum { value = 1 }; };
+#endif
 // clang-format on
 
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/make_32_64_or_128_bit.h b/libcxx/include/__type_traits/make_32_64_128_or_256_bit.h
similarity index 70%
rename from libcxx/include/__type_traits/make_32_64_or_128_bit.h
rename to libcxx/include/__type_traits/make_32_64_128_or_256_bit.h
index 7016209ec9c0a..f4de69017033c 100644
--- a/libcxx/include/__type_traits/make_32_64_or_128_bit.h
+++ b/libcxx/include/__type_traits/make_32_64_128_or_256_bit.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H
-#define _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H
+#ifndef _LIBCPP___TYPE_TRAITS_MAKE_32_64_128_OR_256_BIT_H
+#define _LIBCPP___TYPE_TRAITS_MAKE_32_64_128_OR_256_BIT_H
 
 #include <__config>
 #include <__type_traits/conditional.h>
@@ -23,19 +23,23 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-/// Helper to promote an integral to smallest 32, 64, or 128 bit representation.
+/// Helper to promote an integral to smallest 32, 64, 128, or 256 bit representation.
 ///
-/// The restriction is the same as the integral version of to_char.
+/// The restriction is the same as the integral version of to_chars.
 template <class _Tp>
 #if _LIBCPP_STD_VER >= 20
   requires(is_signed_v<_Tp> || is_unsigned_v<_Tp> || is_same_v<_Tp, char>)
 #endif
 // clang-format off
-using __make_32_64_or_128_bit_t _LIBCPP_NODEBUG =
+using __make_32_64_128_or_256_bit_t _LIBCPP_NODEBUG =
     __copy_unsigned_t<_Tp,
         __conditional_t<sizeof(_Tp) <= sizeof(int32_t),    int32_t,
         __conditional_t<sizeof(_Tp) <= sizeof(int64_t),    int64_t,
-#if _LIBCPP_HAS_INT128
+#if _LIBCPP_HAS_INT256
+        __conditional_t<sizeof(_Tp) <= sizeof(__int128_t), __int128_t,
+        __conditional_t<sizeof(_Tp) <= sizeof(__int256_t), __int256_t,
+        /* else */                                         void> >
+#elif _LIBCPP_HAS_INT128
         __conditional_t<sizeof(_Tp) <= sizeof(__int128_t), __int128_t,
         /* else */                                         void>
 #else
@@ -46,4 +50,4 @@ using __make_32_64_or_128_bit_t _LIBCPP_NODEBUG =
 
 _LIBCPP_END_NAMESPACE_STD
 
-#endif // _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H
+#endif // _LIBCPP___TYPE_TRAITS_MAKE_32_64_128_OR_256_BIT_H
diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h
index dff23d880dc30..794de061403bd 100644
--- a/libcxx/include/__type_traits/make_signed.h
+++ b/libcxx/include/__type_traits/make_signed.h
@@ -37,6 +37,10 @@ using __signed_types =
 #  if _LIBCPP_HAS_INT128
                 ,
                 __int128_t
+#  endif
+#  if _LIBCPP_HAS_INT256
+                ,
+                __int256_t
 #  endif
                 >;
 
@@ -62,6 +66,10 @@ template <> struct __make_signed<unsigned long long, true> {typedef long long ty
 template <> struct __make_signed<__int128_t,         true> {typedef __int128_t type;};
 template <> struct __make_signed<__uint128_t,        true> {typedef __int128_t type;};
 #  endif
+#  if _LIBCPP_HAS_INT256
+template <> struct __make_signed<__int256_t,         true> {typedef __int256_t type;};
+template <> struct __make_signed<__uint256_t,        true> {typedef __int256_t type;};
+#  endif
 // clang-format on
 
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h
index a83baa658e294..a8bb71bfc8314 100644
--- a/libcxx/include/__type_traits/make_unsigned.h
+++ b/libcxx/include/__type_traits/make_unsigned.h
@@ -39,6 +39,10 @@ using __unsigned_types =
 #  if _LIBCPP_HAS_INT128
                 ,
                 __uint128_t
+#  endif
+#  if _LIBCPP_HAS_INT256
+                ,
+                __uint256_t
 #  endif
                 >;
 
@@ -64,6 +68,10 @@ template <> struct __make_unsigned<unsigned long long, true> {typedef unsigned l
 template <> struct __make_unsigned<__int128_t,         true> {typedef __uint128_t        type;};
 template <> struct __make_unsigned<__uint128_t,        true> {typedef __uint128_t        type;};
 #  endif
+#  if _LIBCPP_HAS_INT256
+template <> struct __make_unsigned<__int256_t,         true> {typedef __uint256_t        type;};
+template <> struct __make_unsigned<__uint256_t,        true> {typedef __uint256_t        type;};
+#  endif
 // clang-format on
 
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/promote.h b/libcxx/include/__type_traits/promote.h
index 96b4903032b18..94a535a3d0d04 100644
--- a/libcxx/include/__type_traits/promote.h
+++ b/libcxx/include/__type_traits/promote.h
@@ -31,6 +31,10 @@ double __promote_impl(unsigned long long);
 double __promote_impl(__int128_t);
 double __promote_impl(__uint128_t);
 #endif
+#if _LIBCPP_HAS_INT256
+double __promote_impl(__int256_t);
+double __promote_impl(__uint256_t);
+#endif
 double __promote_impl(double);
 long double __promote_impl(long double);
 
diff --git a/libcxx/include/__utility/convert_to_integral.h b/libcxx/include/__utility/convert_to_integral.h
index c8149b7744984..802f9e1c5a0ec 100644
--- a/libcxx/include/__utility/convert_to_integral.h
+++ b/libcxx/include/__utility/convert_to_integral.h
@@ -48,6 +48,12 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __int128_t __convert_to_integral(
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __uint128_t __convert_to_integral(__uint128_t __val) { return __val; }
 #endif
 
+#if _LIBCPP_HAS_INT256
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __int256_t __convert_to_integral(__int256_t __val) { return __val; }
+
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __uint256_t __convert_to_integral(__uint256_t __val) { return __val; }
+#endif
+
 template <class _Tp, bool = is_enum<_Tp>::value>
 struct __sfinae_underlying_type {
   using type = __underlying_type_t<_Tp>;
diff --git a/libcxx/include/limits b/libcxx/include/limits
index ff40d2051d06f..0e34bb49220ea 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -186,7 +186,10 @@ protected:
 
   static _LIBCPP_CONSTEXPR const bool is_signed   = type(-1) < type(0);
   static _LIBCPP_CONSTEXPR const int digits       = static_cast<int>(sizeof(type) * __CHAR_BIT__ - is_signed);
-  static _LIBCPP_CONSTEXPR const int digits10     = digits * 3 / 10;
+  // floor(digits * log10(2)); 301/1000 approximates log10(2) = 0.30103...
+  // more accurately than 3/10 = 0.3, which under-counts at 256+ bits.
+  // Exact for all bit widths up to at least 33000 (beyond any practical use).
+  static _LIBCPP_CONSTEXPR const int digits10     = digits * 301 / 1000;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
   static _LIBCPP_CONSTEXPR const type __min       = is_signed ? _Tp(_Tp(1) << digits) : 0;
   static _LIBCPP_CONSTEXPR const type __max       = is_signed ? type(type(~0) ^ __min) : type(~0);
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 9012ed18cbd79..9a779c7c28f94 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -343,7 +343,7 @@ module std_core [system] {
     }
     module is_within_lifetime                         { header "__type_traits/is_within_lifetime.h" }
     module lazy                                       { header "__type_traits/lazy.h" }
-    module make_32_64_or_128_bit                      { header "__type_traits/make_32_64_or_128_bit.h" }
+    module make_32_64_128_or_256_bit                  { header "__type_traits/make_32_64_128_or_256_bit.h" }
     module make_const_lvalue_ref                      { header "__type_traits/make_const_lvalue_ref.h" }
     module make_signed                                { header "__type_traits/make_signed.h" }
     module make_transparent                           { header "__type_traits/make_transparent.h" }

>From d8baaf3f131d46005e5ccf51515d89e8531115ef Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:53 +0100
Subject: [PATCH 11/17] [libc++][test] Add __int256 tests

Add libc++ test coverage for __int256 support:
- Type traits: is_integral, is_signed/unsigned, make_signed/unsigned,
  integer concepts (__libcpp_integer, __libcpp_signed/unsigned_integer),
  is_always_bitcastable
- numeric_limits: all members (min, max, digits, digits10, etc.)
- charconv: to_chars/from_chars for various bases
- format: format_arg type enum, integer formatting
- bit operations: byteswap, countl_zero, countr_zero, popcount
- Containers: hash<__int256_t> and hash<__uint256_t>
- convert_to_integral
- Update test_macros.h with TEST_HAS_INT256 feature macro

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 .../__libcpp_integer.compile.pass.cpp         |   6 +
 .../__libcpp_signed_integer.compile.pass.cpp  |   6 +
 ...__libcpp_unsigned_integer.compile.pass.cpp |   6 +
 .../containers/unord/hash/int256.pass.cpp     | 136 ++++++++++
 .../numerics/bit.ops/int256.byteswap.pass.cpp |  56 ++++
 .../libcxx/numerics/bit.ops/int256.pass.cpp   |  99 ++++++++
 .../charconv/int256.from_chars.pass.cpp       | 240 ++++++++++++++++++
 .../libcxx/numerics/charconv/int256.pass.cpp  |  87 +++++++
 .../numerics/numeric.limits/int256.pass.cpp   | 211 +++++++++++++++
 .../type_traits/convert_to_integral.pass.cpp  |   4 +
 .../test/libcxx/type_traits/int256.pass.cpp   |  93 +++++++
 .../is_always_bitcastable.compile.pass.cpp    |   5 +
 .../format.arg/arg_t.compile.pass.cpp         |  24 +-
 .../format/format.arguments/int256.pass.cpp   | 119 +++++++++
 .../format.context/types.compile.pass.cpp     |   6 +
 libcxx/test/support/test_macros.h             |   8 +
 16 files changed, 1096 insertions(+), 10 deletions(-)
 create mode 100644 libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp
 create mode 100644 libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp
 create mode 100644 libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp
 create mode 100644 libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp
 create mode 100644 libcxx/test/libcxx/numerics/charconv/int256.pass.cpp
 create mode 100644 libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp
 create mode 100644 libcxx/test/libcxx/type_traits/int256.pass.cpp
 create mode 100644 libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp

diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
index 4958a258137a1..b0380e3d9368f 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
@@ -33,6 +33,9 @@ static_assert(std::__signed_or_unsigned_integer<unsigned short int>);
 #if _LIBCPP_HAS_INT128
 static_assert(std::__signed_or_unsigned_integer<__uint128_t>);
 #endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__signed_or_unsigned_integer<__uint256_t>);
+#endif
 // Signed
 static_assert(std::__signed_or_unsigned_integer<signed char>);
 static_assert(std::__signed_or_unsigned_integer<short int>);
@@ -43,6 +46,9 @@ static_assert(std::__signed_or_unsigned_integer<short int>);
 #if _LIBCPP_HAS_INT128
 static_assert(std::__signed_or_unsigned_integer<__int128_t>);
 #endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__signed_or_unsigned_integer<__int256_t>);
+#endif
 // Non-integer
 static_assert(!std::__signed_or_unsigned_integer<bool>);
 static_assert(!std::__signed_or_unsigned_integer<char>);
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
index 3fa342685770c..8296b1abe4658 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
@@ -33,6 +33,9 @@ static_assert(!std::__signed_integer<unsigned short int>);
 #if _LIBCPP_HAS_INT128
 static_assert(!std::__signed_integer<__uint128_t>);
 #endif
+#if _LIBCPP_HAS_INT256
+static_assert(!std::__signed_integer<__uint256_t>);
+#endif
 // Signed
 static_assert(std::__signed_integer<signed char>);
 static_assert(std::__signed_integer<short int>);
@@ -43,6 +46,9 @@ static_assert(std::__signed_integer<short int>);
 #if _LIBCPP_HAS_INT128
 static_assert(std::__signed_integer<__int128_t>);
 #endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__signed_integer<__int256_t>);
+#endif
 // Non-integer
 static_assert(!std::__signed_integer<bool>);
 static_assert(!std::__signed_integer<char>);
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
index ff60f32319171..48dcceb1f8924 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
@@ -33,6 +33,9 @@ static_assert(std::__unsigned_integer<unsigned short int>);
 #if _LIBCPP_HAS_INT128
 static_assert(std::__unsigned_integer<__uint128_t>);
 #endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__unsigned_integer<__uint256_t>);
+#endif
 // Signed
 static_assert(!std::__unsigned_integer<signed char>);
 static_assert(!std::__unsigned_integer<short int>);
@@ -43,6 +46,9 @@ static_assert(!std::__unsigned_integer<short int>);
 #if _LIBCPP_HAS_INT128
 static_assert(!std::__unsigned_integer<__int128_t>);
 #endif
+#if _LIBCPP_HAS_INT256
+static_assert(!std::__unsigned_integer<__int256_t>);
+#endif
 // Non-integer
 static_assert(!std::__unsigned_integer<bool>);
 static_assert(!std::__unsigned_integer<char>);
diff --git a/libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp b/libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp
new file mode 100644
index 0000000000000..8c0a039535eb6
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp
@@ -0,0 +1,136 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Test std::hash specialization for __int256_t / __uint256_t.
+//
+// The generic __hash_impl dispatches to __scalar_hash<_Tp, N> where
+// N = sizeof(_Tp) / sizeof(size_t). For __int256_t on 64-bit platforms,
+// N = 32/8 = 4, using __scalar_hash<_Tp, 4> which hashes via __hash_memory.
+
+#include <functional>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+int main(int, char**) {
+  std::hash<__int256_t> h_s;
+  std::hash<__uint256_t> h_u;
+
+  // --- Basic consistency: same input always gives same output ---
+  {
+    __int256_t a = 42;
+    if (h_s(a) != h_s(a))
+      return 1;
+
+    __uint256_t b = 42;
+    if (h_u(b) != h_u(b))
+      return 2;
+  }
+
+  // --- Different values should (usually) give different hashes ---
+  {
+    __uint256_t a = 0;
+    __uint256_t b = 1;
+    __uint256_t c = (__uint256_t)1 << 128;
+    __uint256_t d = (__uint256_t)1 << 255;
+
+    // We can't guarantee different hashes for all pairs (pigeonhole),
+    // but for these carefully chosen values it's astronomically unlikely
+    // that all four hash to the same value.
+    size_t ha = h_u(a);
+    size_t hb = h_u(b);
+    size_t hc = h_u(c);
+    size_t hd = h_u(d);
+
+    // At least 2 of the 4 hashes should be distinct
+    int distinct = 1;
+    if (hb != ha)
+      ++distinct;
+    if (hc != ha && hc != hb)
+      ++distinct;
+    if (hd != ha && hd != hb && hd != hc)
+      ++distinct;
+    if (distinct < 2)
+      return 3;
+  }
+
+  // --- Zero and negative values ---
+  {
+    __int256_t zero = 0;
+    __int256_t neg  = -1;
+    // Hash of 0 and -1 should differ (very high probability)
+    if (h_s(zero) == h_s(neg)) {
+      // Allow this in theory, but verify the hash function is callable
+      (void)h_s(zero);
+    }
+  }
+
+  // --- Large values near max ---
+  {
+    __uint256_t max_val     = ~(__uint256_t)0;
+    __uint256_t max_minus_1 = max_val - 1;
+    // These should produce valid hash values (no crash)
+    size_t h1 = h_u(max_val);
+    size_t h2 = h_u(max_minus_1);
+    (void)h1;
+    (void)h2;
+  }
+
+  // --- std::unordered_set with __uint256_t keys ---
+  {
+    std::unordered_set<__uint256_t> s;
+    s.insert(0);
+    s.insert(1);
+    s.insert((__uint256_t)1 << 128);
+    s.insert(~(__uint256_t)0);
+
+    if (s.size() != 4)
+      return 4;
+    if (s.count(0) != 1)
+      return 5;
+    if (s.count(1) != 1)
+      return 6;
+    if (s.count(2) != 0)
+      return 7;
+  }
+
+  // --- std::unordered_map with __int256_t keys ---
+  {
+    std::unordered_map<__int256_t, int> m;
+    m[0]                    = 10;
+    m[-1]                   = 20;
+    m[(__int256_t)1 << 200] = 30;
+
+    if (m.size() != 3)
+      return 8;
+    if (m[0] != 10)
+      return 9;
+    if (m[-1] != 20)
+      return 10;
+  }
+
+  // --- Signed and unsigned hash independence ---
+  // hash<__int256_t>(42) and hash<__uint256_t>(42) may or may not be equal
+  // (unspecified), but both must be callable
+  {
+    __int256_t sv  = 42;
+    __uint256_t uv = 42;
+    (void)h_s(sv);
+    (void)h_u(uv);
+  }
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp b/libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp
new file mode 100644
index 0000000000000..fc6e5957bf83d
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// Test std::byteswap with __int256_t and __uint256_t
+
+#include <bit>
+#include <cassert>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+constexpr __uint256_t
+make256(unsigned long long h3, unsigned long long h2, unsigned long long h1, unsigned long long h0) {
+  __uint256_t v = (__uint256_t)h3;
+  v             = (v << 64) | (__uint256_t)h2;
+  v             = (v << 64) | (__uint256_t)h1;
+  v             = (v << 64) | (__uint256_t)h0;
+  return v;
+}
+
+// Constexpr tests
+static_assert(std::byteswap((__uint256_t)0) == (__uint256_t)0);
+static_assert(std::byteswap(~(__uint256_t)0) == ~(__uint256_t)0);
+
+// Known pattern: bytes 01 02 03 ... 20 reversed: 20 1F 1E ... 01
+static_assert(std::byteswap(make256(0x0102030405060708, 0x090A0B0C0D0E0F10, 0x1112131415161718, 0x191A1B1C1D1E1F20)) ==
+              make256(0x201F1E1D1C1B1A19, 0x1817161514131211, 0x100F0E0D0C0B0A09, 0x0807060504030201));
+
+// Double byteswap is identity
+static_assert(std::byteswap(std::byteswap(make256(0xDEADBEEF, 0xCAFEBABE, 0x12345678, 0x9ABCDEF0))) ==
+              make256(0xDEADBEEF, 0xCAFEBABE, 0x12345678, 0x9ABCDEF0));
+
+// Signed byteswap compiles
+static_assert(std::byteswap((__int256_t)0) == (__int256_t)0);
+
+int main(int, char**) {
+  // Runtime verification
+  __uint256_t val      = make256(0x0102030405060708, 0x090A0B0C0D0E0F10, 0x1112131415161718, 0x191A1B1C1D1E1F20);
+  __uint256_t swapped  = std::byteswap(val);
+  __uint256_t expected = make256(0x201F1E1D1C1B1A19, 0x1817161514131211, 0x100F0E0D0C0B0A09, 0x0807060504030201);
+  assert(swapped == expected);
+  assert(std::byteswap(swapped) == val);
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp b/libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp
new file mode 100644
index 0000000000000..f4a7de8378bea
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp
@@ -0,0 +1,99 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Test <bit> operations with __uint256_t (e.g. popcount for Hamming distance)
+
+#include <bit>
+#include <limits>
+#include <type_traits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// std::popcount -- the core operation for computing Hamming distance
+static_assert(std::popcount((__uint256_t)0) == 0);
+static_assert(std::popcount((__uint256_t)1) == 1);
+static_assert(std::popcount((__uint256_t)0xFF) == 8);
+static_assert(std::popcount((__uint256_t)0xFFFFFFFFFFFFFFFF) == 64);
+
+// std::countl_zero
+static_assert(std::countl_zero((__uint256_t)0) == 256);
+static_assert(std::countl_zero((__uint256_t)1) == 255);
+
+// std::countr_zero
+static_assert(std::countr_zero((__uint256_t)0) == 256);
+static_assert(std::countr_zero((__uint256_t)1) == 0);
+static_assert(std::countr_zero((__uint256_t)2) == 1);
+
+// std::countl_one
+static_assert(std::countl_one((__uint256_t)0) == 0);
+
+// std::countr_one
+static_assert(std::countr_one((__uint256_t)0) == 0);
+static_assert(std::countr_one((__uint256_t)1) == 1);
+static_assert(std::countr_one((__uint256_t)0xFF) == 8);
+
+// std::has_single_bit
+static_assert(std::has_single_bit((__uint256_t)1));
+static_assert(std::has_single_bit((__uint256_t)2));
+static_assert(std::has_single_bit((__uint256_t)4));
+static_assert(!std::has_single_bit((__uint256_t)3));
+static_assert(!std::has_single_bit((__uint256_t)0));
+
+// std::bit_width
+static_assert(std::bit_width((__uint256_t)0) == 0);
+static_assert(std::bit_width((__uint256_t)1) == 1);
+static_assert(std::bit_width((__uint256_t)2) == 2);
+static_assert(std::bit_width((__uint256_t)255) == 8);
+
+// std::rotl / std::rotr
+static_assert(std::rotl((__uint256_t)1, 1) == 2);
+static_assert(std::rotl((__uint256_t)1, 64) == ((__uint256_t)1 << 64));
+static_assert(std::rotl((__uint256_t)1, 255) == ((__uint256_t)1 << 255));
+static_assert(std::rotr((__uint256_t)2, 1) == 1);
+static_assert(std::rotr((__uint256_t)1, 1) == ((__uint256_t)1 << 255));
+static_assert(std::rotl(std::rotr((__uint256_t)0xFF, 4), 4) == 0xFF);
+
+// std::bit_ceil
+static_assert(std::bit_ceil((__uint256_t)0) == 1);
+static_assert(std::bit_ceil((__uint256_t)1) == 1);
+static_assert(std::bit_ceil((__uint256_t)2) == 2);
+static_assert(std::bit_ceil((__uint256_t)3) == 4);
+static_assert(std::bit_ceil((__uint256_t)255) == 256);
+
+// std::bit_floor
+static_assert(std::bit_floor((__uint256_t)0) == 0);
+static_assert(std::bit_floor((__uint256_t)1) == 1);
+static_assert(std::bit_floor((__uint256_t)2) == 2);
+static_assert(std::bit_floor((__uint256_t)3) == 2);
+static_assert(std::bit_floor((__uint256_t)255) == 128);
+
+int main(int, char**) {
+  // Runtime: Hamming distance via popcount of XOR
+  __uint256_t a = (__uint256_t)0xDEADBEEF << 128 | 0xCAFEBABE;
+  __uint256_t b = (__uint256_t)0xFEEDFACE << 128 | 0xBAADF00D;
+  int hamming   = std::popcount(a ^ b);
+  (void)hamming;
+
+  // Runtime: Verify popcount of known pattern
+  __uint256_t all_ones_low64 = 0xFFFFFFFFFFFFFFFF;
+  if (std::popcount(all_ones_low64) != 64)
+    return 1;
+
+  __uint256_t all_zeros = 0;
+  if (std::popcount(all_zeros) != 0)
+    return 2;
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp b/libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp
new file mode 100644
index 0000000000000..eb6b2f3ad3ced
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// Requires compiler-rt __int256 builtins (__udivoi3, __umodoi3) at runtime.
+// These are not yet available in the system compiler-rt library.
+// REQUIRES: int256-runtime
+
+// Test std::from_chars support for __int256_t / __uint256_t.
+//
+// from_chars works generically for all integral types via SFINAE on
+// is_integral<_Tp>::value. The implementation uses __itoa::__traits<_Tp>
+// for the base-10 fast path, and __itoa::__mul_overflowed (via
+// __builtin_mul_overflow) for other bases. Both support __uint256_t.
+
+#include <charconv>
+#include <cstring>
+#include <limits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// Helper: round-trip through to_chars then from_chars, verify value is preserved.
+template <typename T>
+bool round_trip(T value) {
+  char buf[80];
+  auto [to_ptr, to_ec] = std::to_chars(buf, buf + sizeof(buf), value);
+  if (to_ec != std::errc{})
+    return false;
+
+  T parsed{};
+  auto [from_ptr, from_ec] = std::from_chars(buf, to_ptr, parsed);
+  if (from_ec != std::errc{})
+    return false;
+  if (from_ptr != to_ptr)
+    return false;
+  return parsed == value;
+}
+
+// Helper: round-trip with explicit base.
+template <typename T>
+bool round_trip_base(T value, int base) {
+  char buf[260]; // base-2 of 256-bit = 256 chars + sign
+  auto [to_ptr, to_ec] = std::to_chars(buf, buf + sizeof(buf), value, base);
+  if (to_ec != std::errc{})
+    return false;
+
+  T parsed{};
+  auto [from_ptr, from_ec] = std::from_chars(buf, to_ptr, parsed, base);
+  if (from_ec != std::errc{})
+    return false;
+  if (from_ptr != to_ptr)
+    return false;
+  return parsed == value;
+}
+
+int main(int, char**) {
+  // ====================================================================
+  // Basic from_chars (base 10, default)
+  // ====================================================================
+
+  // --- Parse small unsigned values ---
+  {
+    __uint256_t val;
+    const char* str = "42";
+    auto [ptr, ec]  = std::from_chars(str, str + 2, val);
+    if (ec != std::errc{} || val != 42 || ptr != str + 2)
+      return 1;
+  }
+
+  // --- Parse zero ---
+  {
+    __uint256_t val;
+    const char* str = "0";
+    auto [ptr, ec]  = std::from_chars(str, str + 1, val);
+    if (ec != std::errc{} || val != 0)
+      return 2;
+  }
+
+  // --- Parse negative signed value ---
+  {
+    __int256_t val;
+    const char* str = "-1";
+    auto [ptr, ec]  = std::from_chars(str, str + 2, val);
+    if (ec != std::errc{} || val != -1)
+      return 3;
+  }
+
+  // --- Parse value > 64-bit ---
+  {
+    __uint256_t val;
+    const char* str = "18446744073709551616"; // 2^64
+    auto [ptr, ec]  = std::from_chars(str, str + std::strlen(str), val);
+    if (ec != std::errc{} || val != ((__uint256_t)1 << 64))
+      return 4;
+  }
+
+  // --- Parse value > 128-bit ---
+  {
+    __uint256_t val;
+    // 2^128 = 340282366920938463463374607431768211456
+    const char* str = "340282366920938463463374607431768211456";
+    auto [ptr, ec]  = std::from_chars(str, str + std::strlen(str), val);
+    if (ec != std::errc{} || val != ((__uint256_t)1 << 128))
+      return 5;
+  }
+
+  // --- Invalid input ---
+  {
+    __uint256_t val = 999;
+    const char* str = "abc";
+    auto [ptr, ec]  = std::from_chars(str, str + 3, val);
+    if (ec != std::errc::invalid_argument)
+      return 6;
+    // val should be unchanged on error
+  }
+
+  // --- Leading zeros ---
+  {
+    __uint256_t val;
+    const char* str = "00042";
+    auto [ptr, ec]  = std::from_chars(str, str + 5, val);
+    if (ec != std::errc{} || val != 42)
+      return 7;
+  }
+
+  // ====================================================================
+  // Round-trip: to_chars -> from_chars for various values
+  // ====================================================================
+
+  // Unsigned values
+  if (!round_trip<__uint256_t>(0))
+    return 10;
+  if (!round_trip<__uint256_t>(1))
+    return 11;
+  if (!round_trip<__uint256_t>(42))
+    return 12;
+  if (!round_trip<__uint256_t>((__uint256_t)1 << 64))
+    return 13;
+  if (!round_trip<__uint256_t>((__uint256_t)1 << 128))
+    return 14;
+  if (!round_trip<__uint256_t>((__uint256_t)1 << 200))
+    return 15;
+  if (!round_trip<__uint256_t>(~(__uint256_t)0)) // max
+    return 16;
+
+  // Signed values
+  if (!round_trip<__int256_t>(0))
+    return 20;
+  if (!round_trip<__int256_t>(1))
+    return 21;
+  if (!round_trip<__int256_t>(-1))
+    return 22;
+  if (!round_trip<__int256_t>((__int256_t)1 << 200))
+    return 23;
+  if (!round_trip<__int256_t>(std::numeric_limits<__int256_t>::max()))
+    return 24;
+  if (!round_trip<__int256_t>(std::numeric_limits<__int256_t>::min()))
+    return 25;
+
+  // ====================================================================
+  // Non-decimal bases: hex, octal, binary
+  // ====================================================================
+
+  // --- Hex (base 16) ---
+  {
+    __uint256_t val;
+    const char* str = "ff";
+    auto [ptr, ec]  = std::from_chars(str, str + 2, val, 16);
+    if (ec != std::errc{} || val != 255)
+      return 30;
+  }
+
+  // --- Hex round-trip ---
+  if (!round_trip_base<__uint256_t>((__uint256_t)1 << 128, 16))
+    return 31;
+  if (!round_trip_base<__uint256_t>(~(__uint256_t)0, 16))
+    return 32;
+
+  // --- Octal (base 8) ---
+  {
+    __uint256_t val;
+    const char* str = "777";
+    auto [ptr, ec]  = std::from_chars(str, str + 3, val, 8);
+    if (ec != std::errc{} || val != 0777)
+      return 33;
+  }
+
+  // --- Binary (base 2) ---
+  {
+    __uint256_t val;
+    const char* str = "1010";
+    auto [ptr, ec]  = std::from_chars(str, str + 4, val, 2);
+    if (ec != std::errc{} || val != 10)
+      return 34;
+  }
+
+  // --- Base 36 ---
+  if (!round_trip_base<__uint256_t>((__uint256_t)1 << 100, 36))
+    return 35;
+
+  // ====================================================================
+  // Overflow detection
+  // ====================================================================
+
+  // --- Unsigned overflow ---
+  {
+    __uint256_t val;
+    // max uint256 + 1 in decimal: append a digit to max
+    // Use a string that's definitely too large
+    const char* str = "115792089237316195423570985008687907853"
+                      "269984665640564039457584007913129639936"; // 2^256
+    auto [ptr, ec]  = std::from_chars(str, str + std::strlen(str), val);
+    if (ec != std::errc::result_out_of_range)
+      return 40;
+  }
+
+  // --- Signed overflow (positive) ---
+  {
+    __int256_t val;
+    // max int256 + 1 = 2^255
+    const char* str = "57896044618658097711785492504343953926"
+                      "634992332820282019728792003956564819968";
+    auto [ptr, ec]  = std::from_chars(str, str + std::strlen(str), val);
+    if (ec != std::errc::result_out_of_range)
+      return 41;
+  }
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/charconv/int256.pass.cpp b/libcxx/test/libcxx/numerics/charconv/int256.pass.cpp
new file mode 100644
index 0000000000000..f40af2b5f6278
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/charconv/int256.pass.cpp
@@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// Requires compiler-rt __int256 builtins (__udivoi3, __umodoi3) at runtime.
+// These are not yet available in the system compiler-rt library.
+// REQUIRES: int256-runtime
+
+// Test to_chars support for __uint256_t / __int256_t
+
+#include <charconv>
+#include <cstring>
+#include <limits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+int main(int, char**) {
+  char buf[80]; // 78 digits max + sign + null
+
+  // to_chars: small values that fit in 64-bit
+  {
+    __uint256_t val = 42;
+    auto [ptr, ec]  = std::to_chars(buf, buf + sizeof(buf), val);
+    *ptr            = '\0';
+    if (ec != std::errc{} || std::strcmp(buf, "42") != 0)
+      return 1;
+  }
+
+  // to_chars: value that fits in 128-bit but not 64-bit
+  {
+    __uint256_t val = (__uint256_t)1 << 64;
+    auto [ptr, ec]  = std::to_chars(buf, buf + sizeof(buf), val);
+    *ptr            = '\0';
+    if (ec != std::errc{} || std::strcmp(buf, "18446744073709551616") != 0)
+      return 2;
+  }
+
+  // to_chars: value > 128-bit
+  {
+    // 2^128 = 340282366920938463463374607431768211456
+    __uint256_t val = (__uint256_t)1 << 128;
+    auto [ptr, ec]  = std::to_chars(buf, buf + sizeof(buf), val);
+    *ptr            = '\0';
+    if (ec != std::errc{} || std::strcmp(buf, "340282366920938463463374607431768211456") != 0)
+      return 3;
+  }
+
+  // to_chars: zero
+  {
+    __uint256_t val = 0;
+    auto [ptr, ec]  = std::to_chars(buf, buf + sizeof(buf), val);
+    *ptr            = '\0';
+    if (ec != std::errc{} || std::strcmp(buf, "0") != 0)
+      return 4;
+  }
+
+  // to_chars: signed negative
+  {
+    __int256_t val = -1;
+    auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val);
+    *ptr           = '\0';
+    if (ec != std::errc{} || std::strcmp(buf, "-1") != 0)
+      return 5;
+  }
+
+  // to_chars: buffer too small
+  {
+    __uint256_t val = (__uint256_t)1 << 128;
+    char small[5];
+    auto [ptr, ec] = std::to_chars(small, small + sizeof(small), val);
+    if (ec != std::errc::value_too_large)
+      return 6;
+  }
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp b/libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp
new file mode 100644
index 0000000000000..f50fc5ca2be2f
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp
@@ -0,0 +1,211 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Test std::numeric_limits specialization for __int256_t / __uint256_t.
+//
+// The generic __libcpp_numeric_limits<_Tp, true> template handles all
+// arithmetic types, including __int256_t and __uint256_t. This test verifies
+// that the specialization produces correct values for all properties.
+
+#include <cstddef>
+#include <limits>
+#include <type_traits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// ========================================================================
+// Static properties (compile-time)
+// ========================================================================
+
+// --- is_specialized ---
+static_assert(std::numeric_limits<__int256_t>::is_specialized, "");
+static_assert(std::numeric_limits<__uint256_t>::is_specialized, "");
+static_assert(std::numeric_limits<const __int256_t>::is_specialized, "");
+static_assert(std::numeric_limits<volatile __uint256_t>::is_specialized, "");
+static_assert(std::numeric_limits<const volatile __int256_t>::is_specialized, "");
+
+// --- is_signed ---
+static_assert(std::numeric_limits<__int256_t>::is_signed, "");
+static_assert(!std::numeric_limits<__uint256_t>::is_signed, "");
+
+// --- is_integer, is_exact ---
+static_assert(std::numeric_limits<__int256_t>::is_integer, "");
+static_assert(std::numeric_limits<__uint256_t>::is_integer, "");
+static_assert(std::numeric_limits<__int256_t>::is_exact, "");
+static_assert(std::numeric_limits<__uint256_t>::is_exact, "");
+
+// --- radix ---
+static_assert(std::numeric_limits<__int256_t>::radix == 2, "");
+static_assert(std::numeric_limits<__uint256_t>::radix == 2, "");
+
+// --- digits ---
+// __int256_t: 256 bits - 1 sign bit = 255 value bits
+// __uint256_t: 256 bits, all value bits
+static_assert(std::numeric_limits<__int256_t>::digits == 255, "");
+static_assert(std::numeric_limits<__uint256_t>::digits == 256, "");
+
+// --- digits10 ---
+// digits10 = floor(digits * log10(2))
+// For __int256_t:  floor(255 * 0.30103) = floor(76.76) = 76
+// For __uint256_t: floor(256 * 0.30103) = floor(77.06) = 77
+static_assert(std::numeric_limits<__int256_t>::digits10 == 76, "");
+static_assert(std::numeric_limits<__uint256_t>::digits10 == 77, "");
+
+// --- max_digits10 ---
+static_assert(std::numeric_limits<__int256_t>::max_digits10 == 0, "");
+static_assert(std::numeric_limits<__uint256_t>::max_digits10 == 0, "");
+
+// --- is_bounded ---
+static_assert(std::numeric_limits<__int256_t>::is_bounded, "");
+static_assert(std::numeric_limits<__uint256_t>::is_bounded, "");
+
+// --- is_modulo ---
+// Signed: not modulo (overflow is UB). Unsigned: modulo (wraps).
+static_assert(!std::numeric_limits<__int256_t>::is_modulo, "");
+static_assert(std::numeric_limits<__uint256_t>::is_modulo, "");
+
+// --- has_infinity, has_quiet_NaN, etc. ---
+static_assert(!std::numeric_limits<__int256_t>::has_infinity, "");
+static_assert(!std::numeric_limits<__uint256_t>::has_infinity, "");
+static_assert(!std::numeric_limits<__int256_t>::has_quiet_NaN, "");
+static_assert(!std::numeric_limits<__uint256_t>::has_quiet_NaN, "");
+static_assert(!std::numeric_limits<__int256_t>::has_signaling_NaN, "");
+static_assert(!std::numeric_limits<__uint256_t>::has_signaling_NaN, "");
+
+// --- is_iec559 ---
+static_assert(!std::numeric_limits<__int256_t>::is_iec559, "");
+static_assert(!std::numeric_limits<__uint256_t>::is_iec559, "");
+
+// --- exponent fields ---
+static_assert(std::numeric_limits<__int256_t>::min_exponent == 0, "");
+static_assert(std::numeric_limits<__int256_t>::max_exponent == 0, "");
+static_assert(std::numeric_limits<__int256_t>::min_exponent10 == 0, "");
+static_assert(std::numeric_limits<__int256_t>::max_exponent10 == 0, "");
+
+// --- round_style ---
+static_assert(std::numeric_limits<__int256_t>::round_style == std::round_toward_zero, "");
+static_assert(std::numeric_limits<__uint256_t>::round_style == std::round_toward_zero, "");
+
+// --- Relationship to __int128 ---
+static_assert(std::numeric_limits<__int256_t>::digits == 2 * std::numeric_limits<__int128_t>::digits + 1, "");
+static_assert(std::numeric_limits<__uint256_t>::digits == 2 * std::numeric_limits<__uint128_t>::digits, "");
+
+// ========================================================================
+// Runtime value checks
+// ========================================================================
+
+int main(int, char**) {
+  // --- unsigned min/max ---
+  {
+    __uint256_t umin = std::numeric_limits<__uint256_t>::min();
+    __uint256_t umax = std::numeric_limits<__uint256_t>::max();
+    __uint256_t ulow = std::numeric_limits<__uint256_t>::lowest();
+
+    // min() for unsigned is 0
+    if (umin != 0)
+      return 1;
+
+    // max() is all-ones (2^256 - 1)
+    if (umax != ~(__uint256_t)0)
+      return 2;
+
+    // lowest() == min() for integers
+    if (ulow != umin)
+      return 3;
+
+    // max + 1 wraps to 0 (unsigned modulo)
+    __uint256_t wrapped = umax + 1;
+    if (wrapped != 0)
+      return 4;
+  }
+
+  // --- signed min/max ---
+  {
+    __int256_t smin = std::numeric_limits<__int256_t>::min();
+    __int256_t smax = std::numeric_limits<__int256_t>::max();
+    __int256_t slow = std::numeric_limits<__int256_t>::lowest();
+
+    // min() is negative (sign bit set)
+    if (smin >= 0)
+      return 5;
+
+    // max() is positive
+    if (smax <= 0)
+      return 6;
+
+    // lowest() == min() for integers
+    if (slow != smin)
+      return 7;
+
+    // min() == -(2^255)
+    // Verify by checking that min() has only the MSB set when viewed as unsigned
+    __uint256_t umin_bits    = (__uint256_t)smin;
+    __uint256_t expected_msb = (__uint256_t)1 << 255;
+    if (umin_bits != expected_msb)
+      return 8;
+
+    // max() == 2^255 - 1
+    // All bits except MSB are set
+    __uint256_t umax_bits = (__uint256_t)smax;
+    if (umax_bits != (expected_msb - 1))
+      return 9;
+
+    // min + max == -1 (two's complement identity)
+    if (smin + smax != -1)
+      return 10;
+  }
+
+  // --- epsilon, denorm_min, infinity, NaN are all zero for integers ---
+  {
+    if (std::numeric_limits<__int256_t>::epsilon() != 0)
+      return 11;
+    if (std::numeric_limits<__int256_t>::round_error() != 0)
+      return 12;
+    if (std::numeric_limits<__int256_t>::infinity() != 0)
+      return 13;
+    if (std::numeric_limits<__int256_t>::quiet_NaN() != 0)
+      return 14;
+    if (std::numeric_limits<__int256_t>::signaling_NaN() != 0)
+      return 15;
+    if (std::numeric_limits<__int256_t>::denorm_min() != 0)
+      return 16;
+  }
+
+  // --- const/volatile qualifiers preserve behavior ---
+  {
+    if (std::numeric_limits<const __uint256_t>::max() != std::numeric_limits<__uint256_t>::max())
+      return 17;
+    if (std::numeric_limits<volatile __int256_t>::min() != std::numeric_limits<__int256_t>::min())
+      return 18;
+    if (std::numeric_limits<const volatile __uint256_t>::digits != 256)
+      return 19;
+  }
+
+  // --- Cross-check with __int128 ---
+  {
+    // max(__uint256_t) > max(__uint128_t)
+    __uint256_t u256_max = std::numeric_limits<__uint256_t>::max();
+    __uint128_t u128_max = std::numeric_limits<__uint128_t>::max();
+    if (u256_max <= (__uint256_t)u128_max)
+      return 20;
+
+    // The upper 128 bits of max(__uint256_t) should be max(__uint128_t)
+    __uint128_t upper = (__uint128_t)(u256_max >> 128);
+    if (upper != u128_max)
+      return 21;
+  }
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp b/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp
index f1036b3929f06..d77b1dd626bd6 100644
--- a/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp
@@ -112,6 +112,10 @@ int main(int, char**)
 #ifndef TEST_HAS_NO_INT128
   check_integral_types<__int128_t, __int128_t>();
   check_integral_types<__uint128_t, __uint128_t>();
+#endif
+#ifndef TEST_HAS_NO_INT256
+  check_integral_types<__int256_t, __int256_t>();
+  check_integral_types<__uint256_t, __uint256_t>();
 #endif
     // TODO(ericwf): Not standard
   typedef std::underlying_type<enum1>::type Enum1UT;
diff --git a/libcxx/test/libcxx/type_traits/int256.pass.cpp b/libcxx/test/libcxx/type_traits/int256.pass.cpp
new file mode 100644
index 0000000000000..a50fbb4959ffd
--- /dev/null
+++ b/libcxx/test/libcxx/type_traits/int256.pass.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Test type traits support for __int256_t / __uint256_t
+
+#include <cstddef>
+#include <limits>
+#include <type_traits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// is_integral
+static_assert(std::is_integral<__int256_t>::value, "");
+static_assert(std::is_integral<__uint256_t>::value, "");
+static_assert(std::is_integral<const __int256_t>::value, "");
+static_assert(std::is_integral<volatile __uint256_t>::value, "");
+
+// is_arithmetic (derived from is_integral)
+static_assert(std::is_arithmetic<__int256_t>::value, "");
+static_assert(std::is_arithmetic<__uint256_t>::value, "");
+
+// is_signed / is_unsigned
+static_assert(std::is_signed<__int256_t>::value, "");
+static_assert(!std::is_unsigned<__int256_t>::value, "");
+static_assert(!std::is_signed<__uint256_t>::value, "");
+static_assert(std::is_unsigned<__uint256_t>::value, "");
+
+// is_fundamental
+static_assert(std::is_fundamental<__int256_t>::value, "");
+static_assert(std::is_fundamental<__uint256_t>::value, "");
+
+// is_scalar
+static_assert(std::is_scalar<__int256_t>::value, "");
+static_assert(std::is_scalar<__uint256_t>::value, "");
+
+// make_signed / make_unsigned
+static_assert(std::is_same<std::make_signed<__uint256_t>::type, __int256_t>::value, "");
+static_assert(std::is_same<std::make_signed<__int256_t>::type, __int256_t>::value, "");
+static_assert(std::is_same<std::make_unsigned<__int256_t>::type, __uint256_t>::value, "");
+static_assert(std::is_same<std::make_unsigned<__uint256_t>::type, __uint256_t>::value, "");
+
+#  if TEST_STD_VER >= 14
+static_assert(std::is_same<std::make_signed_t<__uint256_t>, __int256_t>::value, "");
+static_assert(std::is_same<std::make_unsigned_t<__int256_t>, __uint256_t>::value, "");
+#  endif
+
+// numeric_limits
+static_assert(std::numeric_limits<__int256_t>::is_specialized, "");
+static_assert(std::numeric_limits<__uint256_t>::is_specialized, "");
+static_assert(std::numeric_limits<__int256_t>::is_integer, "");
+static_assert(std::numeric_limits<__uint256_t>::is_integer, "");
+static_assert(std::numeric_limits<__int256_t>::is_signed, "");
+static_assert(!std::numeric_limits<__uint256_t>::is_signed, "");
+static_assert(std::numeric_limits<__int256_t>::digits == 255, ""); // 256 - 1 sign bit
+static_assert(std::numeric_limits<__uint256_t>::digits == 256, "");
+static_assert(std::numeric_limits<__int256_t>::is_exact, "");
+static_assert(std::numeric_limits<__uint256_t>::radix == 2, "");
+
+// sizeof
+static_assert(sizeof(__int256_t) == 32, "");
+static_assert(sizeof(__uint256_t) == 32, "");
+
+// Comparison with __int128
+static_assert(sizeof(__int256_t) == 2 * sizeof(__int128_t), "");
+static_assert(std::numeric_limits<__uint256_t>::digits == 2 * std::numeric_limits<__uint128_t>::digits, "");
+
+int main(int, char**) {
+  // Runtime basic sanity
+  __int256_t a  = 42;
+  __uint256_t b = 100;
+  __int256_t c  = a + (__int256_t)b;
+  (void)c;
+
+  // make_signed / make_unsigned runtime
+  std::make_unsigned<__int256_t>::type u = 1;
+  std::make_signed<__uint256_t>::type s  = -1;
+  (void)u;
+  (void)s;
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp
index 9bbb85f2fe30c..e0cfd74153344 100644
--- a/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp
@@ -105,6 +105,11 @@ constexpr void test() {
     check<true, types::type_list<__int128_t, __uint128_t>>();
 #endif
 
+    // 256-bit types.
+#ifndef TEST_HAS_NO_INT256
+    check<true, types::type_list<__int256_t, __uint256_t>>();
+#endif
+
     // Bool.
     check<true, types::type_list<bool>, types::concatenate_t<types::type_list<bool>, integral_8>>();
 
diff --git a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
index 8ecca81cdfe58..85497629b3261 100644
--- a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
@@ -21,19 +21,23 @@
 
 static_assert(std::is_same_v<std::underlying_type_t<std::__format::__arg_t>, std::uint8_t>);
 
+// The 128-bit and 256-bit types are unconditionally in the enum to avoid
+// the values depending on the availability of extended integer types.
 static_assert(std::uint8_t(std::__format::__arg_t::__none) == 0);
 static_assert(std::uint8_t(std::__format::__arg_t::__boolean) == 1);
 static_assert(std::uint8_t(std::__format::__arg_t::__char_type) == 2);
 static_assert(std::uint8_t(std::__format::__arg_t::__int) == 3);
 static_assert(std::uint8_t(std::__format::__arg_t::__long_long) == 4);
 static_assert(std::uint8_t(std::__format::__arg_t::__i128) == 5);
-static_assert(std::uint8_t(std::__format::__arg_t::__unsigned) == 6);
-static_assert(std::uint8_t(std::__format::__arg_t::__unsigned_long_long) == 7);
-static_assert(std::uint8_t(std::__format::__arg_t::__u128) == 8);
-static_assert(std::uint8_t(std::__format::__arg_t::__float) == 9);
-static_assert(std::uint8_t(std::__format::__arg_t::__double) == 10);
-static_assert(std::uint8_t(std::__format::__arg_t::__long_double) == 11);
-static_assert(std::uint8_t(std::__format::__arg_t::__const_char_type_ptr) == 12);
-static_assert(std::uint8_t(std::__format::__arg_t::__string_view) == 13);
-static_assert(std::uint8_t(std::__format::__arg_t::__ptr) == 14);
-static_assert(std::uint8_t(std::__format::__arg_t::__handle) == 15);
+static_assert(std::uint8_t(std::__format::__arg_t::__i256) == 6);
+static_assert(std::uint8_t(std::__format::__arg_t::__unsigned) == 7);
+static_assert(std::uint8_t(std::__format::__arg_t::__unsigned_long_long) == 8);
+static_assert(std::uint8_t(std::__format::__arg_t::__u128) == 9);
+static_assert(std::uint8_t(std::__format::__arg_t::__u256) == 10);
+static_assert(std::uint8_t(std::__format::__arg_t::__float) == 11);
+static_assert(std::uint8_t(std::__format::__arg_t::__double) == 12);
+static_assert(std::uint8_t(std::__format::__arg_t::__long_double) == 13);
+static_assert(std::uint8_t(std::__format::__arg_t::__const_char_type_ptr) == 14);
+static_assert(std::uint8_t(std::__format::__arg_t::__string_view) == 15);
+static_assert(std::uint8_t(std::__format::__arg_t::__ptr) == 16);
+static_assert(std::uint8_t(std::__format::__arg_t::__handle) == 17);
diff --git a/libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp b/libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp
new file mode 100644
index 0000000000000..d4bee9ab9f0f7
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp
@@ -0,0 +1,119 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: 32-bit-pointer
+// UNSUPPORTED: gcc
+
+// Decimal formatting of __uint256_t requires division builtins from compiler-rt.
+// ADDITIONAL_COMPILE_FLAGS: --rtlib=compiler-rt
+
+// Test std::format support for __int256_t / __uint256_t
+
+#include <cassert>
+#include <format>
+#include <string>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+int main(int, char**) {
+  // Basic decimal formatting
+  assert(std::format("{}", (__uint256_t)0) == "0");
+  assert(std::format("{}", (__uint256_t)42) == "42");
+  assert(std::format("{}", (__int256_t)-42) == "-42");
+  assert(std::format("{}", (__int256_t)0) == "0");
+
+  // Large values
+  assert(std::format("{}", (__uint256_t)1 << 64) == "18446744073709551616");
+  assert(std::format("{}", (__uint256_t)1 << 128) == "340282366920938463463374607431768211456");
+
+  // Max value (2^256 - 1)
+  assert(std::format("{}", (__uint256_t)-1) ==
+         "115792089237316195423570985008687907853269984665640564039457584007913129639935");
+
+  // Width and alignment
+  assert(std::format("{:>5}", (__uint256_t)42) == "   42");
+  assert(std::format("{:<5}", (__uint256_t)42) == "42   ");
+  assert(std::format("{:^5}", (__uint256_t)42) == " 42  ");
+
+  // Fill character
+  assert(std::format("{:*>5}", (__uint256_t)42) == "***42");
+  assert(std::format("{:0>5}", (__uint256_t)42) == "00042");
+
+  // Sign
+  assert(std::format("{:+}", (__int256_t)42) == "+42");
+  assert(std::format("{:+}", (__int256_t)-42) == "-42");
+  assert(std::format("{: }", (__int256_t)42) == " 42");
+
+  // Hexadecimal
+  assert(std::format("{:x}", (__uint256_t)255) == "ff");
+  assert(std::format("{:X}", (__uint256_t)255) == "FF");
+  assert(std::format("{:#x}", (__uint256_t)255) == "0xff");
+  assert(std::format("{:#X}", (__uint256_t)255) == "0XFF");
+
+  // Octal
+  assert(std::format("{:o}", (__uint256_t)8) == "10");
+  assert(std::format("{:#o}", (__uint256_t)8) == "010");
+
+  // Binary
+  assert(std::format("{:b}", (__uint256_t)10) == "1010");
+  assert(std::format("{:#b}", (__uint256_t)10) == "0b1010");
+
+  // Zero-padded with width
+  assert(std::format("{:010}", (__uint256_t)42) == "0000000042");
+  assert(std::format("{:010}", (__int256_t)-42) == "-000000042");
+
+  // Comparison with __int128 formatting (should produce identical results
+  // for values that fit in both types)
+  __int128_t i128val = 123456789012345LL;
+  __int256_t i256val = 123456789012345LL;
+  assert(std::format("{}", i128val) == std::format("{}", i256val));
+  assert(std::format("{:+020x}", i128val) == std::format("{:+020x}", i256val));
+
+  // Full-width big-number tests (all 4 x 64-bit limbs populated).
+  // Hex output directly corresponds to the hex digits of the input value.
+  {
+    __uint256_t big = ((__uint256_t)0xAAAABBBBCCCCDDDDULL << 192) | ((__uint256_t)0xEEEEFFFF11112222ULL << 128) |
+                      ((__uint256_t)0x3333444455556666ULL << 64) | (__uint256_t)0x7777888899990000ULL;
+    assert(std::format("{:x}", big) == "aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+    assert(std::format("{:X}", big) == "AAAABBBBCCCCDDDDEEEEFFFF1111222233334444555566667777888899990000");
+    assert(std::format("{:#x}", big) == "0xaaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+    // Width and alignment (64 hex digits, padded to 70)
+    assert(std::format("{:>70x}", big) == "      aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+    assert(std::format("{:*<70x}", big) == "aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000******");
+    // Zero-padded hex with prefix
+    assert(std::format("{:#070x}", big) == "0x0000aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+  }
+
+  // INT256_MIN: -(2^255).
+  // Decimal verified: 2^256 = 11579...9936 (from UINT256_MAX + 1),
+  // so 2^255 = 57896044618658097711785492504343953926634992332820282019728792003956564819968
+  // (77 decimal digits)
+  {
+    __uint256_t u_min  = (__uint256_t)1 << 255;
+    __int256_t min_val = (__int256_t)u_min;
+    assert(std::format("{}", min_val) ==
+           "-57896044618658097711785492504343953926634992332820282019728792003956564819968");
+  }
+
+  // Negative value: hex formatting and wide right-aligned decimal output
+  {
+    __int256_t neg = (__int256_t)-42;
+    // Verify hex representation: -42 in hex is "-2a"
+    assert(std::format("{:x}", neg) == "-2a");
+    // Wide format of a negative value
+    assert(std::format("{:+80}", neg) == std::string(77, ' ') + "-42");
+  }
+
+  return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
index cd06c509ffda2..a5fa0900eb7e7 100644
--- a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
@@ -69,6 +69,12 @@ constexpr void test() {
       std::is_same_v<typename std::basic_format_context<
                          OutIt, CharT>::template formatter_type<__uint128_t>,
                      std::formatter<__uint128_t, CharT>>);
+#endif
+#ifndef TEST_HAS_NO_INT256
+  static_assert(std::is_same_v<typename std::basic_format_context<OutIt, CharT>::template formatter_type<__int256_t>,
+                               std::formatter<__int256_t, CharT>>);
+  static_assert(std::is_same_v<typename std::basic_format_context<OutIt, CharT>::template formatter_type<__uint256_t>,
+                               std::formatter<__uint256_t, CharT>>);
 #endif
   static_assert(
       std::is_same_v<typename std::basic_format_context<
diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index 8d88d6fad7d0b..e518fd9a2dd56 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -435,6 +435,14 @@ inline Tp const& DoNotOptimize(Tp const& value) {
 #  define TEST_HAS_NO_INT128
 #endif
 
+#ifdef _LIBCPP_USE_FROZEN_CXX03_HEADERS
+#  define TEST_HAS_NO_INT256
+#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_HAS_INT256) || !_LIBCPP_HAS_INT256)
+#  define TEST_HAS_NO_INT256
+#elif !defined(__SIZEOF_INT256__)
+#  define TEST_HAS_NO_INT256
+#endif
+
 #if defined(_LIBCPP_VERSION) && !_LIBCPP_HAS_LOCALIZATION
 #  define TEST_HAS_NO_LOCALIZATION
 #endif

>From 7dc5519b4e56e1511c6646f313e5750a78bbb8cc Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:41:03 +0100
Subject: [PATCH 12/17] [lldb] Add __int256/__uint256 debugger support

Extend LLDB to handle 256-bit integer types:
- Scalar.h/cpp: Add e_sint256/e_uint256 to Scalar::Type enum, extend
  APInt operations for 256-bit width
- lldb-enumerations.h: Add eEncodingSint256/eEncodingUint256
- TypeSystemClang.cpp: Map clang's Int256/UInt256 builtin types to
  LLDB scalar type, handle in GetEncoding/GetFormat/GetBasicTypeFromAST

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 lldb/include/lldb/Utility/Scalar.h            |  4 +++
 lldb/include/lldb/lldb-enumerations.h         |  2 ++
 .../TypeSystem/Clang/TypeSystemClang.cpp      | 28 +++++++++++++++++++
 lldb/source/Utility/Scalar.cpp                | 24 ++++++++++++++++
 4 files changed, 58 insertions(+)

diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h
index dbb260962f1d6..5567c4ff1c671 100644
--- a/lldb/include/lldb/Utility/Scalar.h
+++ b/lldb/include/lldb/Utility/Scalar.h
@@ -179,6 +179,10 @@ class Scalar {
 
   llvm::APInt UInt128(const llvm::APInt &fail_value) const;
 
+  llvm::APInt SInt256(const llvm::APInt &fail_value) const;
+
+  llvm::APInt UInt256(const llvm::APInt &fail_value) const;
+
   float Float(float fail_value = 0.0f) const;
 
   double Double(double fail_value = 0.0) const;
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 7ebcb2214e0e4..e4a0306784b21 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -836,6 +836,8 @@ enum BasicType {
   eBasicTypeUnsignedLongLong,
   eBasicTypeInt128,
   eBasicTypeUnsignedInt128,
+  eBasicTypeInt256,
+  eBasicTypeUnsignedInt256,
   eBasicTypeBool,
   eBasicTypeHalf,
   eBasicTypeFloat,
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 0984d4d7190e7..354d45210c37d 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -771,6 +771,8 @@ TypeSystemClang::GetBuiltinTypeForEncodingAndBitSize(Encoding encoding,
       return GetType(ast.UnsignedLongLongTy);
     if (QualTypeMatchesBitSize(bit_size, ast, ast.UnsignedInt128Ty))
       return GetType(ast.UnsignedInt128Ty);
+    if (QualTypeMatchesBitSize(bit_size, ast, ast.UnsignedInt256Ty))
+      return GetType(ast.UnsignedInt256Ty);
     break;
 
   case eEncodingSint:
@@ -786,6 +788,8 @@ TypeSystemClang::GetBuiltinTypeForEncodingAndBitSize(Encoding encoding,
       return GetType(ast.LongLongTy);
     if (QualTypeMatchesBitSize(bit_size, ast, ast.Int128Ty))
       return GetType(ast.Int128Ty);
+    if (QualTypeMatchesBitSize(bit_size, ast, ast.Int256Ty))
+      return GetType(ast.Int256Ty);
     break;
 
   case eEncodingIEEE754:
@@ -864,6 +868,12 @@ lldb::BasicType TypeSystemClang::GetBasicTypeEnumeration(llvm::StringRef name) {
       {"__int128", eBasicTypeInt128},
       {"unsigned __int128", eBasicTypeUnsignedInt128},
 
+      // "int256"
+      {"__int256_t", eBasicTypeInt256},
+      {"__uint256_t", eBasicTypeUnsignedInt256},
+      {"__int256", eBasicTypeInt256},
+      {"unsigned __int256", eBasicTypeUnsignedInt256},
+
       // "bool"
       {"bool", eBasicTypeBool},
       {"_Bool", eBasicTypeBool},
@@ -2043,6 +2053,10 @@ TypeSystemClang::GetOpaqueCompilerType(clang::ASTContext *ast,
     return ast->Int128Ty.getAsOpaquePtr();
   case eBasicTypeUnsignedInt128:
     return ast->UnsignedInt128Ty.getAsOpaquePtr();
+  case eBasicTypeInt256:
+    return ast->Int256Ty.getAsOpaquePtr();
+  case eBasicTypeUnsignedInt256:
+    return ast->UnsignedInt256Ty.getAsOpaquePtr();
   case eBasicTypeBool:
     return ast->BoolTy.getAsOpaquePtr();
   case eBasicTypeHalf:
@@ -3812,6 +3826,7 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type,
     case clang::BuiltinType::ULong:
     case clang::BuiltinType::ULongLong:
     case clang::BuiltinType::UInt128:
+    case clang::BuiltinType::UInt256:
     case clang::BuiltinType::Char_S:
     case clang::BuiltinType::SChar:
     case clang::BuiltinType::WChar_S:
@@ -3820,6 +3835,7 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type,
     case clang::BuiltinType::Long:
     case clang::BuiltinType::LongLong:
     case clang::BuiltinType::Int128:
+    case clang::BuiltinType::Int256:
     case clang::BuiltinType::Float:
     case clang::BuiltinType::Double:
     case clang::BuiltinType::LongDouble:
@@ -4814,6 +4830,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) {
     case clang::BuiltinType::Long:
     case clang::BuiltinType::LongLong:
     case clang::BuiltinType::Int128:
+    case clang::BuiltinType::Int256:
       return lldb::eEncodingSint;
 
     case clang::BuiltinType::Bool:
@@ -4828,6 +4845,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) {
     case clang::BuiltinType::ULong:
     case clang::BuiltinType::ULongLong:
     case clang::BuiltinType::UInt128:
+    case clang::BuiltinType::UInt256:
       return lldb::eEncodingUint;
 
     // Fixed point types. Note that they are currently ignored.
@@ -5144,6 +5162,10 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) {
       return lldb::eFormatUnsigned;
     case clang::BuiltinType::Int128:
       return lldb::eFormatDecimal;
+    case clang::BuiltinType::UInt256:
+      return lldb::eFormatUnsigned;
+    case clang::BuiltinType::Int256:
+      return lldb::eFormatDecimal;
     case clang::BuiltinType::Half:
     case clang::BuiltinType::Float:
     case clang::BuiltinType::Double:
@@ -5455,6 +5477,10 @@ TypeSystemClang::GetBasicTypeEnumeration(lldb::opaque_compiler_type_t type) {
         return eBasicTypeInt128;
       case clang::BuiltinType::UInt128:
         return eBasicTypeUnsignedInt128;
+      case clang::BuiltinType::Int256:
+        return eBasicTypeInt256;
+      case clang::BuiltinType::UInt256:
+        return eBasicTypeUnsignedInt256;
 
       case clang::BuiltinType::Half:
         return eBasicTypeHalf;
@@ -6020,6 +6046,7 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) {
     case clang::BuiltinType::ULong:
     case clang::BuiltinType::ULongLong:
     case clang::BuiltinType::UInt128:
+    case clang::BuiltinType::UInt256:
     case clang::BuiltinType::Char_S:
     case clang::BuiltinType::SChar:
     case clang::BuiltinType::WChar_S:
@@ -6028,6 +6055,7 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) {
     case clang::BuiltinType::Long:
     case clang::BuiltinType::LongLong:
     case clang::BuiltinType::Int128:
+    case clang::BuiltinType::Int256:
     case clang::BuiltinType::Float:
     case clang::BuiltinType::Double:
     case clang::BuiltinType::LongDouble:
diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp
index f2c18cdd896da..f01be182c2d3f 100644
--- a/lldb/source/Utility/Scalar.cpp
+++ b/lldb/source/Utility/Scalar.cpp
@@ -390,6 +390,30 @@ llvm::APInt Scalar::UInt128(const llvm::APInt &fail_value) const {
   return fail_value;
 }
 
+llvm::APInt Scalar::SInt256(const llvm::APInt &fail_value) const {
+  switch (m_type) {
+  case e_void:
+    break;
+  case e_int:
+    return m_integer;
+  case e_float:
+    return ToAPInt(m_float, 256, /*is_unsigned=*/false);
+  }
+  return fail_value;
+}
+
+llvm::APInt Scalar::UInt256(const llvm::APInt &fail_value) const {
+  switch (m_type) {
+  case e_void:
+    break;
+  case e_int:
+    return m_integer;
+  case e_float:
+    return ToAPInt(m_float, 256, /*is_unsigned=*/true);
+  }
+  return fail_value;
+}
+
 float Scalar::Float(float fail_value) const {
   switch (m_type) {
   case e_void:

>From 7f28fdc6e73ab3c4cbc3c3d07f00d859cf82e6a6 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:41:13 +0100
Subject: [PATCH 13/17] [libc] Add __int256/__uint256 type support

Extend LLVM libc type support infrastructure for 256-bit integers:
- types.h: Define LIBC_TYPES_HAS_INT256, Int256/UInt256 typedefs
- uint128.h: Add Int256/UInt256 to the header (alongside UInt128)
- is_integral.h: Recognize __int256_t/__uint256_t as integral types
- make_signed/unsigned.h: Add 256-bit type mappings
- big_int.h: Specialize BigInt traits for native __int256

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 libc/src/__support/CPP/type_traits/is_integral.h   | 3 +++
 libc/src/__support/CPP/type_traits/make_signed.h   | 4 ++++
 libc/src/__support/CPP/type_traits/make_unsigned.h | 4 ++++
 libc/src/__support/big_int.h                       | 3 +++
 libc/src/__support/macros/properties/types.h       | 5 +++++
 libc/src/__support/uint128.h                       | 8 ++++++++
 6 files changed, 27 insertions(+)

diff --git a/libc/src/__support/CPP/type_traits/is_integral.h b/libc/src/__support/CPP/type_traits/is_integral.h
index 09047cb00bf75..fa83cbcdbff84 100644
--- a/libc/src/__support/CPP/type_traits/is_integral.h
+++ b/libc/src/__support/CPP/type_traits/is_integral.h
@@ -28,6 +28,9 @@ template <typename T> struct is_integral {
 public:
   LIBC_INLINE_VAR static constexpr bool value =
       __is_unqualified_any_of<T,
+#ifdef LIBC_TYPES_HAS_INT256
+                              __int256_t, __uint256_t,
+#endif
 #ifdef LIBC_TYPES_HAS_INT128
                               __int128_t, __uint128_t,
 #endif
diff --git a/libc/src/__support/CPP/type_traits/make_signed.h b/libc/src/__support/CPP/type_traits/make_signed.h
index 00bc6be8fcc18..1f822533d25f2 100644
--- a/libc/src/__support/CPP/type_traits/make_signed.h
+++ b/libc/src/__support/CPP/type_traits/make_signed.h
@@ -33,6 +33,10 @@ struct make_signed<unsigned long long> : type_identity<long long> {};
 template <> struct make_signed<__int128_t> : type_identity<__int128_t> {};
 template <> struct make_signed<__uint128_t> : type_identity<__int128_t> {};
 #endif
+#ifdef LIBC_TYPES_HAS_INT256
+template <> struct make_signed<__int256_t> : type_identity<__int256_t> {};
+template <> struct make_signed<__uint256_t> : type_identity<__int256_t> {};
+#endif
 template <typename T> using make_signed_t = typename make_signed<T>::type;
 
 } // namespace cpp
diff --git a/libc/src/__support/CPP/type_traits/make_unsigned.h b/libc/src/__support/CPP/type_traits/make_unsigned.h
index e5f60ae665219..9ed8b8e01b4c0 100644
--- a/libc/src/__support/CPP/type_traits/make_unsigned.h
+++ b/libc/src/__support/CPP/type_traits/make_unsigned.h
@@ -38,6 +38,10 @@ struct make_unsigned<unsigned long long> : type_identity<unsigned long long> {};
 template <> struct make_unsigned<__int128_t> : type_identity<__uint128_t> {};
 template <> struct make_unsigned<__uint128_t> : type_identity<__uint128_t> {};
 #endif
+#ifdef LIBC_TYPES_HAS_INT256
+template <> struct make_unsigned<__int256_t> : type_identity<__uint256_t> {};
+template <> struct make_unsigned<__uint256_t> : type_identity<__uint256_t> {};
+#endif
 template <typename T> using make_unsigned_t = typename make_unsigned<T>::type;
 
 } // namespace cpp
diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
index bb9cefd67b552..e154a43656434 100644
--- a/libc/src/__support/big_int.h
+++ b/libc/src/__support/big_int.h
@@ -38,6 +38,9 @@ template <> struct half_width<uint32_t> : cpp::type_identity<uint16_t> {};
 template <> struct half_width<uint64_t> : cpp::type_identity<uint32_t> {};
 #ifdef LIBC_TYPES_HAS_INT128
 template <> struct half_width<__uint128_t> : cpp::type_identity<uint64_t> {};
+#ifdef LIBC_TYPES_HAS_INT256
+template <> struct half_width<__uint256_t> : cpp::type_identity<__uint128_t> {};
+#endif // LIBC_TYPES_HAS_INT256
 #endif // LIBC_TYPES_HAS_INT128
 #endif // LIBC_TYPES_HAS_INT64
 template <typename T> using half_width_t = typename half_width<T>::type;
diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h
index 3259c8a6a1d12..61b991d22eb8f 100644
--- a/libc/src/__support/macros/properties/types.h
+++ b/libc/src/__support/macros/properties/types.h
@@ -46,6 +46,11 @@
 #define LIBC_TYPES_HAS_INT128
 #endif // defined(__SIZEOF_INT128__)
 
+// int256 / uint256 support
+#if defined(__SIZEOF_INT256__)
+#define LIBC_TYPES_HAS_INT256
+#endif // defined(__SIZEOF_INT256__)
+
 // -- float16 support ---------------------------------------------------------
 // LIBC_TYPES_HAS_FLOAT16 is provided by
 // "include/llvm-libc-macros/float16-macros.h"
diff --git a/libc/src/__support/uint128.h b/libc/src/__support/uint128.h
index 722e79d0802e2..6e40aee314f7c 100644
--- a/libc/src/__support/uint128.h
+++ b/libc/src/__support/uint128.h
@@ -20,4 +20,12 @@ using UInt128 = LIBC_NAMESPACE::UInt<128>;
 using Int128 = LIBC_NAMESPACE::Int<128>;
 #endif // LIBC_TYPES_HAS_INT128
 
+#ifdef LIBC_TYPES_HAS_INT256
+using UInt256 = __uint256_t;
+using Int256 = __int256_t;
+#else
+using UInt256 = LIBC_NAMESPACE::UInt<256>;
+using Int256 = LIBC_NAMESPACE::Int<256>;
+#endif // LIBC_TYPES_HAS_INT256
+
 #endif // LLVM_LIBC_SRC___SUPPORT_UINT128_H

>From c0cd30f00d955c3c4513fa514adc3fce1f982059 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:41:22 +0100
Subject: [PATCH 14/17] [flang] Add __int256 host type mapping

Map __int256_t to Fortran INTEGER(32) in the host type evaluation
infrastructure, enabling constant folding for 256-bit integer
expressions when the host compiler supports __int256.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 flang/lib/Evaluate/host.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/flang/lib/Evaluate/host.h b/flang/lib/Evaluate/host.h
index fbb52f2886a40..81cbf0b8071ea 100644
--- a/flang/lib/Evaluate/host.h
+++ b/flang/lib/Evaluate/host.h
@@ -129,6 +129,14 @@ template <> struct HostTypeHelper<Type<TypeCategory::Integer, 16>> {
 #endif
 };
 
+template <> struct HostTypeHelper<Type<TypeCategory::Integer, 32>> {
+#if defined(__SIZEOF_INT256__)
+  using Type = __int256_t;
+#else
+  using Type = UnsupportedType;
+#endif
+};
+
 // TODO no mapping to host types are defined currently for 16bits float
 // It should be defined when gcc/clang have a better support for it.
 

>From f7b5b7bbfe71d27a2f4ef5d8658183dfe9e61068 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Thu, 26 Feb 2026 14:49:56 +0100
Subject: [PATCH 15/17] [clang][compiler-rt][llvm] Strengthen __int256 tests
 and documentation

Add new test coverage from PR audit remediation:
- RISC-V 64 ABI test (mirrors x86-64/AArch64 patterns)
- _BitInt(192/200) division routing through __divoi3/__udivoi3
- Compound assignment and increment/decrement codegen
- 32-bit target rejection (i686, armv7)
- SPIR target rejection (spirv64, spirv32)
- Stronger debug info test with parameter/local variable checks

Add documentation comments:
- ppcf128<->i256 conversion gap in TargetLoweringBase.cpp
- Shift builtin non-compiler-called status in ashloi3/lshroi3/ashroi3

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 .../CodeGen/RISCV/riscv64-arguments-int256.c  | 37 ++++++++++++
 clang/test/CodeGen/debug-info-int256.c        | 14 +++++
 clang/test/CodeGen/int256-compound-assign.c   | 56 +++++++++++++++++++
 clang/test/Sema/int256-spir-unsupported.c     |  8 +++
 clang/test/Sema/int256-unsupported-target.c   | 13 +++++
 compiler-rt/lib/builtins/ashloi3.c            |  5 ++
 compiler-rt/lib/builtins/ashroi3.c            |  5 ++
 compiler-rt/lib/builtins/lshroi3.c            |  5 ++
 llvm/lib/CodeGen/TargetLoweringBase.cpp       | 12 ++++
 9 files changed, 155 insertions(+)
 create mode 100644 clang/test/CodeGen/RISCV/riscv64-arguments-int256.c
 create mode 100644 clang/test/CodeGen/int256-compound-assign.c
 create mode 100644 clang/test/Sema/int256-spir-unsupported.c
 create mode 100644 clang/test/Sema/int256-unsupported-target.c

diff --git a/clang/test/CodeGen/RISCV/riscv64-arguments-int256.c b/clang/test/CodeGen/RISCV/riscv64-arguments-int256.c
new file mode 100644
index 0000000000000..8b3d97a2b075b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/riscv64-arguments-int256.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple riscv64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify RISC-V 64 IR generation for __int256_t arguments and returns.
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256(i256 noundef %a)
+__int256_t f_ret256(__int256_t a) { return a; }
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256u(i256 noundef %a)
+__uint256_t f_ret256u(__uint256_t a) { return a; }
+
+// Multiple 256-bit args
+// CHECK-LABEL: define{{.*}} i256 @f_two256(i256 noundef %a, i256 noundef %b)
+__int256_t f_two256(__int256_t a, __int256_t b) { return a + b; }
+
+// Mixed argument sizes: 256-bit with smaller types
+// CHECK-LABEL: define{{.*}} i256 @f_mixed(i64 noundef %x, i256 noundef %a, i32 noundef signext %y)
+__int256_t f_mixed(long long x, __int256_t a, int y) { return a; }
+
+// 128-bit vs 256-bit: both returned directly in IR
+// CHECK-LABEL: define{{.*}} i128 @f_ret128(i128 noundef %a)
+__int128_t f_ret128(__int128_t a) { return a; }
+
+// Register exhaustion: 3 i256 args still passed directly (LLVM handles lowering)
+// CHECK-LABEL: define{{.*}} i256 @f_three256(i256 noundef %a, i256 noundef %b, i256 noundef %c)
+__int256_t f_three256(__int256_t a, __int256_t b, __int256_t c) { return a + b + c; }
+
+// Struct containing a 256-bit integer: passed/returned via sret/indirect
+struct s256 { __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct256(ptr dead_on_unwind noalias writable sret(%struct.s256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct s256 f_struct256(struct s256 s) { return s; }
+
+// Nested struct with __int256: also indirect
+struct nested256 { int x; __int256_t val; int y; };
+
+// CHECK-LABEL: define{{.*}} void @f_nested256(ptr dead_on_unwind noalias writable sret(%struct.nested256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct nested256 f_nested256(struct nested256 s) { return s; }
diff --git a/clang/test/CodeGen/debug-info-int256.c b/clang/test/CodeGen/debug-info-int256.c
index eeee2dddfd7f6..a9c5054902f83 100644
--- a/clang/test/CodeGen/debug-info-int256.c
+++ b/clang/test/CodeGen/debug-info-int256.c
@@ -3,10 +3,24 @@
 
 // Verify DWARF debug info encoding for __int256_t and __uint256_t.
 
+// Global variables
 __int256_t s256;
 __uint256_t u256;
 
+// Function with __int256_t parameter and local variable
+void f(__int256_t param) {
+  __uint256_t local = (__uint256_t)param;
+  (void)local;
+}
+
+// Basic type encoding
 // CHECK-DAG: !DIBasicType(name: "__int256", size: 256, encoding: DW_ATE_signed)
 // CHECK-DAG: !DIBasicType(name: "unsigned __int256", size: 256, encoding: DW_ATE_unsigned)
+
+// Typedef encoding
 // CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__int256_t"
 // CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__uint256_t"
+
+// Function parameter and local variable
+// CHECK-DAG: !DILocalVariable(name: "param", arg: 1,
+// CHECK-DAG: !DILocalVariable(name: "local",
diff --git a/clang/test/CodeGen/int256-compound-assign.c b/clang/test/CodeGen/int256-compound-assign.c
new file mode 100644
index 0000000000000..dbf6855530b8b
--- /dev/null
+++ b/clang/test/CodeGen/int256-compound-assign.c
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify IR generation for __int256_t compound assignment and increment ops.
+// On x86-64, __int256 value params use byval and returns use sret.
+
+// CHECK-LABEL: define{{.*}} void @test_add_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: add nsw i256
+void test_add_assign(__int256_t *p, __int256_t v) { *p += v; }
+
+// CHECK-LABEL: define{{.*}} void @test_sub_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: sub nsw i256
+void test_sub_assign(__int256_t *p, __int256_t v) { *p -= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_mul_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: mul nsw i256
+void test_mul_assign(__int256_t *p, __int256_t v) { *p *= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_shl_assign(ptr noundef %p, i32 noundef %n)
+// CHECK: shl i256
+void test_shl_assign(__int256_t *p, int n) { *p <<= n; }
+
+// CHECK-LABEL: define{{.*}} void @test_shr_assign(ptr noundef %p, i32 noundef %n)
+// CHECK: ashr i256
+void test_shr_assign(__int256_t *p, int n) { *p >>= n; }
+
+// CHECK-LABEL: define{{.*}} void @test_ushr_assign(ptr noundef %p, i32 noundef %n)
+// CHECK: lshr i256
+void test_ushr_assign(__uint256_t *p, int n) { *p >>= n; }
+
+// CHECK-LABEL: define{{.*}} void @test_and_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: and i256
+void test_and_assign(__int256_t *p, __int256_t v) { *p &= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_or_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: or i256
+void test_or_assign(__int256_t *p, __int256_t v) { *p |= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_xor_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: xor i256
+void test_xor_assign(__int256_t *p, __int256_t v) { *p ^= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_pre_inc(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0)
+// CHECK: add nsw i256 %{{.*}}, 1
+__int256_t test_pre_inc(__int256_t a) { return ++a; }
+
+// CHECK-LABEL: define{{.*}} void @test_pre_dec(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0)
+// CHECK: add nsw i256 %{{.*}}, -1
+__int256_t test_pre_dec(__int256_t a) { return --a; }
+
+// CHECK-LABEL: define{{.*}} void @test_post_inc(ptr{{.*}}sret(i256){{.*}}, ptr noundef %p)
+// CHECK: add nsw i256 %{{.*}}, 1
+__int256_t test_post_inc(__int256_t *p) { return (*p)++; }
+
+// CHECK-LABEL: define{{.*}} void @test_post_dec(ptr{{.*}}sret(i256){{.*}}, ptr noundef %p)
+// CHECK: add nsw i256 %{{.*}}, -1
+__int256_t test_post_dec(__int256_t *p) { return (*p)--; }
diff --git a/clang/test/Sema/int256-spir-unsupported.c b/clang/test/Sema/int256-spir-unsupported.c
new file mode 100644
index 0000000000000..c79ba61e46e7a
--- /dev/null
+++ b/clang/test/Sema/int256-spir-unsupported.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-unknown-unknown %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv32-unknown-unknown %s
+
+// Verify __int256 is rejected on SPIR targets (even 64-bit SPIR).
+// On SPIR, the __int256_t typedef is not predefined, so use the keyword.
+
+__int256 x; // expected-error {{__int256 is not supported on this target}}
+unsigned __int256 y; // expected-error {{__int256 is not supported on this target}}
diff --git a/clang/test/Sema/int256-unsupported-target.c b/clang/test/Sema/int256-unsupported-target.c
new file mode 100644
index 0000000000000..2a4203c302c36
--- /dev/null
+++ b/clang/test/Sema/int256-unsupported-target.c
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple i686-linux-gnu %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple armv7-linux-gnueabihf %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple riscv32-unknown-elf %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple mipsel-linux-gnu %s
+
+// Verify __int256 is rejected on 32-bit targets.
+// On 32-bit, the __int256_t typedef is not predefined, so use the keyword.
+
+__int256 x; // expected-error {{__int256 is not supported on this target}}
+unsigned __int256 y; // expected-error {{__int256 is not supported on this target}}
+
+void f(__int256 a) {} // expected-error {{__int256 is not supported on this target}}
+__int256 g(void); // expected-error {{__int256 is not supported on this target}}
diff --git a/compiler-rt/lib/builtins/ashloi3.c b/compiler-rt/lib/builtins/ashloi3.c
index 9d81628403ab7..5b83542eec264 100644
--- a/compiler-rt/lib/builtins/ashloi3.c
+++ b/compiler-rt/lib/builtins/ashloi3.c
@@ -8,6 +8,11 @@
 //
 // This file implements __ashloi3 for the compiler_rt library.
 //
+// NOTE: This builtin is not called by the compiler (shift libcalls are not
+// registered for i256 to avoid sanitizer link failures -- ASan embeds UBSan
+// but does not link against compiler-rt builtins). It exists for direct use
+// by libraries and applications that need 256-bit shift operations.
+//
 //===----------------------------------------------------------------------===//
 
 #include "int_lib.h"
diff --git a/compiler-rt/lib/builtins/ashroi3.c b/compiler-rt/lib/builtins/ashroi3.c
index 35b583d47f7cb..1323c4fe12cd8 100644
--- a/compiler-rt/lib/builtins/ashroi3.c
+++ b/compiler-rt/lib/builtins/ashroi3.c
@@ -8,6 +8,11 @@
 //
 // This file implements __ashroi3 for the compiler_rt library.
 //
+// NOTE: This builtin is not called by the compiler (shift libcalls are not
+// registered for i256 to avoid sanitizer link failures -- ASan embeds UBSan
+// but does not link against compiler-rt builtins). It exists for direct use
+// by libraries and applications that need 256-bit shift operations.
+//
 //===----------------------------------------------------------------------===//
 
 #include "int_lib.h"
diff --git a/compiler-rt/lib/builtins/lshroi3.c b/compiler-rt/lib/builtins/lshroi3.c
index d4e4920bda0a1..b8268d7725229 100644
--- a/compiler-rt/lib/builtins/lshroi3.c
+++ b/compiler-rt/lib/builtins/lshroi3.c
@@ -8,6 +8,11 @@
 //
 // This file implements __lshroi3 for the compiler_rt library.
 //
+// NOTE: This builtin is not called by the compiler (shift libcalls are not
+// registered for i256 to avoid sanitizer link failures -- ASan embeds UBSan
+// but does not link against compiler-rt builtins). It exists for direct use
+// by libraries and applications that need 256-bit shift operations.
+//
 //===----------------------------------------------------------------------===//
 
 #include "int_lib.h"
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 355063a91ec40..d5dbfc5fcce40 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -412,6 +412,9 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
       return FPTOSINT_PPCF128_I64;
     if (RetVT == MVT::i128)
       return FPTOSINT_PPCF128_I128;
+    // Note: ppcf128 -> i256 conversion is not yet supported.
+    // ppc_fp128 uses a unique double-double representation that requires
+    // dedicated builtins. Falls back to expansion through smaller types.
   }
   return UNKNOWN_LIBCALL;
 }
@@ -469,6 +472,9 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
       return FPTOUINT_PPCF128_I64;
     if (RetVT == MVT::i128)
       return FPTOUINT_PPCF128_I128;
+    // Note: ppcf128 -> i256 conversion is not yet supported.
+    // ppc_fp128 uses a unique double-double representation that requires
+    // dedicated builtins. Falls back to expansion through smaller types.
   }
   return UNKNOWN_LIBCALL;
 }
@@ -526,6 +532,9 @@ RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
       return SINTTOFP_I256_F80;
     if (RetVT == MVT::f128)
       return SINTTOFP_I256_F128;
+    // Note: i256 -> ppcf128 conversion is not yet supported.
+    // ppc_fp128 uses a unique double-double representation that requires
+    // dedicated builtins. Falls back to expansion through smaller types.
   }
   return UNKNOWN_LIBCALL;
 }
@@ -583,6 +592,9 @@ RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
       return UINTTOFP_I256_F80;
     if (RetVT == MVT::f128)
       return UINTTOFP_I256_F128;
+    // Note: i256 -> ppcf128 conversion is not yet supported.
+    // ppc_fp128 uses a unique double-double representation that requires
+    // dedicated builtins. Falls back to expansion through smaller types.
   }
   return UNKNOWN_LIBCALL;
 }

>From 171bc3c248ecd61049af7011e8ba8f39bfdc09f5 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Thu, 26 Feb 2026 15:08:02 +0100
Subject: [PATCH 16/17] [clang][lldb][test] Add LLDB unit tests and expand
 atomic __int256 tests

Add LLDB unit tests for __int256 support:
- ScalarTest: SInt256/UInt256 getter tests using 2^200 (201-bit value)
- TestTypeSystemClang: eBasicTypeInt256/UnsignedInt256 enum-to-type
  mapping and name-to-type lookup (__int256_t, __uint256_t, etc.)

Expand atomic-int256.c Sema test with:
- __c11_atomic_exchange (signed/unsigned)
- __c11_atomic_compare_exchange_strong (signed/unsigned)
- __c11_atomic_compare_exchange_weak (signed/unsigned)
- Multiple memory orderings (RELAXED, ACQUIRE, RELEASE, ACQ_REL)

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 clang/test/Sema/atomic-int256.c               | 47 +++++++++++++++++++
 lldb/unittests/Symbol/TestTypeSystemClang.cpp | 10 ++++
 lldb/unittests/Utility/ScalarTest.cpp         |  9 ++++
 3 files changed, 66 insertions(+)

diff --git a/clang/test/Sema/atomic-int256.c b/clang/test/Sema/atomic-int256.c
index 6257338e50ad4..c9d1051ba2ede 100644
--- a/clang/test/Sema/atomic-int256.c
+++ b/clang/test/Sema/atomic-int256.c
@@ -24,3 +24,50 @@ __uint256_t load_atomic_unsigned(void) {
 void store_atomic_unsigned(__uint256_t val) {
   __c11_atomic_store(&atomic_u256, val, __ATOMIC_SEQ_CST);
 }
+
+// Atomic exchange
+__int256_t exchange_atomic(__int256_t val) {
+  return __c11_atomic_exchange(&atomic_s256, val, __ATOMIC_SEQ_CST);
+}
+
+__uint256_t exchange_atomic_unsigned(__uint256_t val) {
+  return __c11_atomic_exchange(&atomic_u256, val, __ATOMIC_RELAXED);
+}
+
+// Atomic compare-exchange (strong and weak)
+_Bool cas_strong(__int256_t *expected, __int256_t desired) {
+  return __c11_atomic_compare_exchange_strong(
+      &atomic_s256, expected, desired, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+}
+
+_Bool cas_weak(__int256_t *expected, __int256_t desired) {
+  return __c11_atomic_compare_exchange_weak(
+      &atomic_s256, expected, desired, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+}
+
+_Bool cas_strong_unsigned(__uint256_t *expected, __uint256_t desired) {
+  return __c11_atomic_compare_exchange_strong(
+      &atomic_u256, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+}
+
+_Bool cas_weak_unsigned(__uint256_t *expected, __uint256_t desired) {
+  return __c11_atomic_compare_exchange_weak(
+      &atomic_u256, expected, desired, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+}
+
+// Different memory orderings for load/store
+__int256_t load_relaxed(void) {
+  return __c11_atomic_load(&atomic_s256, __ATOMIC_RELAXED);
+}
+
+__int256_t load_acquire(void) {
+  return __c11_atomic_load(&atomic_s256, __ATOMIC_ACQUIRE);
+}
+
+void store_relaxed(__int256_t val) {
+  __c11_atomic_store(&atomic_s256, val, __ATOMIC_RELAXED);
+}
+
+void store_release(__int256_t val) {
+  __c11_atomic_store(&atomic_s256, val, __ATOMIC_RELEASE);
+}
diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
index 76abdd12d32a6..a18e7b632d80d 100644
--- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp
+++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
@@ -89,6 +89,8 @@ TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) {
       context.hasSameType(GetBasicQualType(eBasicTypeInt), context.IntTy));
   EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeInt128),
                                   context.Int128Ty));
+  EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeInt256),
+                                  context.Int256Ty));
   EXPECT_TRUE(
       context.hasSameType(GetBasicQualType(eBasicTypeLong), context.LongTy));
   EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeLongDouble),
@@ -116,6 +118,8 @@ TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) {
                                   context.UnsignedIntTy));
   EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedInt128),
                                   context.UnsignedInt128Ty));
+  EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedInt256),
+                                  context.UnsignedInt256Ty));
   EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedLong),
                                   context.UnsignedLongTy));
   EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedLongLong),
@@ -171,6 +175,12 @@ TEST_F(TestTypeSystemClang, TestGetBasicTypeFromName) {
   EXPECT_EQ(GetBasicQualType(eBasicTypeInt128), GetBasicQualType("__int128"));
   EXPECT_EQ(GetBasicQualType(eBasicTypeUnsignedInt128),
             GetBasicQualType("unsigned __int128"));
+  EXPECT_EQ(GetBasicQualType(eBasicTypeInt256), GetBasicQualType("__int256_t"));
+  EXPECT_EQ(GetBasicQualType(eBasicTypeUnsignedInt256),
+            GetBasicQualType("__uint256_t"));
+  EXPECT_EQ(GetBasicQualType(eBasicTypeInt256), GetBasicQualType("__int256"));
+  EXPECT_EQ(GetBasicQualType(eBasicTypeUnsignedInt256),
+            GetBasicQualType("unsigned __int256"));
   EXPECT_EQ(GetBasicQualType(eBasicTypeVoid), GetBasicQualType("void"));
   EXPECT_EQ(GetBasicQualType(eBasicTypeBool), GetBasicQualType("bool"));
   EXPECT_EQ(GetBasicQualType(eBasicTypeFloat), GetBasicQualType("float"));
diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp
index 869a5809e6d14..6b3a1604b61e4 100644
--- a/lldb/unittests/Utility/ScalarTest.cpp
+++ b/lldb/unittests/Utility/ScalarTest.cpp
@@ -112,6 +112,15 @@ TEST(ScalarTest, Getters) {
             Scalar(-std::pow(2.0, 70.0)).SInt128(APInt()));
   EXPECT_EQ(APInt(128, 1) << 70, Scalar(std::pow(2.0, 70.0)).UInt128(APInt()));
   EXPECT_EQ(APInt(128, 0), Scalar(-std::pow(2.0, 70.0)).UInt128(APInt()));
+
+  // Int256: use double (not float) since 2^200 exceeds float range (~3.4e38)
+  EXPECT_EQ(APInt(256, 1) << 200,
+            Scalar(std::pow(2.0, 200.0)).SInt256(APInt()));
+  EXPECT_EQ(APInt(256, -1, true) << 200,
+            Scalar(-std::pow(2.0, 200.0)).SInt256(APInt()));
+  EXPECT_EQ(APInt(256, 1) << 200,
+            Scalar(std::pow(2.0, 200.0)).UInt256(APInt()));
+  EXPECT_EQ(APInt(256, 0), Scalar(-std::pow(2.0, 200.0)).UInt256(APInt()));
 }
 
 TEST(ScalarTest, RightShiftOperator) {

>From b0dc0130b2bb35aa0da647f018ae409b6b88e174 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Sun, 1 Mar 2026 18:04:59 +0100
Subject: [PATCH 17/17] [RISCV] Reject tail calls when arguments are passed
 indirectly

When a function argument is passed indirectly (CCValAssign::Indirect),
the caller allocates stack space for the value and passes a pointer to
the callee. If the call is tail-called, the caller's frame is popped
before the callee executes, leaving the pointer dangling -- a
use-after-free on the stack.

X86 already guards against this case in its
isEligibleForTailCallOptimization (X86ISelLoweringCall.cpp:3070-3071).
The RISC-V implementation was missing this check.

The existing test tail-calls.ll had a comment "Do not tail call optimize
if parameters need to be passed indirectly" but the CHECK lines showed
the call being tail-called anyway (the test was auto-generated from the
buggy codegen). After the fix, the test correctly shows a normal call
with the stack frame remaining live across it.

This affects any type requiring indirect passing on RISC-V, such as
fp128 on RV32, and is not specific to any particular frontend type.

Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  8 ++++++++
 llvm/test/CodeGen/RISCV/tail-calls.ll       | 18 ++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a8542be937a87..aa1db300f8df3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24515,6 +24515,14 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization(
   if (Caller.hasFnAttribute("interrupt"))
     return false;
 
+  // Do not tail call optimize if any parameters need to be passed indirectly.
+  // The caller allocates stack space for the indirect argument and passes a
+  // pointer to the callee. A tail call pops the caller's frame before the
+  // callee executes, invalidating the pointer.
+  for (const auto &ArgLoc : ArgLocs)
+    if (ArgLoc.getLocInfo() == CCValAssign::Indirect)
+      return false;
+
   // If the stack arguments for this call do not fit into our own save area then
   // the call cannot be made tail.
   if (CCInfo.getStackSize() > RVFI->getArgumentStackSize())
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index 33feba3c6fba1..79855aa03adcf 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -247,20 +247,24 @@ declare i32 @callee_indirect_args(fp128 %a)
 define void @caller_indirect_args() nounwind {
 ; CHECK-LABEL: caller_indirect_args:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lui a1, 262128
 ; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    sw zero, 0(sp)
 ; CHECK-NEXT:    sw zero, 4(sp)
 ; CHECK-NEXT:    sw zero, 8(sp)
 ; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    tail callee_indirect_args
+; CHECK-NEXT:    call callee_indirect_args
+; CHECK-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 32
+; CHECK-NEXT:    ret
 ;
 ; CHECK-LARGE-ZICFILP-LABEL: caller_indirect_args:
 ; CHECK-LARGE-ZICFILP:       # %bb.0: # %entry
 ; CHECK-LARGE-ZICFILP-NEXT:    lpad 0
-; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, -16
+; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, -32
+; CHECK-LARGE-ZICFILP-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; CHECK-LARGE-ZICFILP-NEXT:    lui a1, 262128
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi9:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a0, %pcrel_hi(.LCPI7_0)
@@ -270,8 +274,10 @@ define void @caller_indirect_args() nounwind {
 ; CHECK-LARGE-ZICFILP-NEXT:    sw zero, 4(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw zero, 8(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw a1, 12(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, 16
-; CHECK-LARGE-ZICFILP-NEXT:    jr t2
+; CHECK-LARGE-ZICFILP-NEXT:    jalr t2
+; CHECK-LARGE-ZICFILP-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, 32
+; CHECK-LARGE-ZICFILP-NEXT:    ret
 entry:
   %call = tail call i32 @callee_indirect_args(fp128 0xL00000000000000003FFF000000000000)
   ret void



More information about the cfe-commits mailing list