[clang] d4bdeca - [X86] Support AMX fast register allocation
Xiang1 Zhang via cfe-commits
cfe-commits at lists.llvm.org
Fri May 7 23:21:29 PDT 2021
Author: Xiang1 Zhang
Date: 2021-05-08T14:21:11+08:00
New Revision: d4bdeca5765ac2e81e217a5fa873d1ffbf0e95b0
URL: https://github.com/llvm/llvm-project/commit/d4bdeca5765ac2e81e217a5fa873d1ffbf0e95b0
DIFF: https://github.com/llvm/llvm-project/commit/d4bdeca5765ac2e81e217a5fa873d1ffbf0e95b0.diff
LOG: [X86] Support AMX fast register allocation
Differential Revision: https://reviews.llvm.org/D100026
Added:
llvm/lib/Target/X86/X86FastTileConfig.cpp
llvm/lib/Target/X86/X86PreAMXConfig.cpp
llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir
Modified:
clang/include/clang/Basic/BuiltinsX86_64.def
llvm/include/llvm/CodeGen/Passes.h
llvm/include/llvm/CodeGen/TargetPassConfig.h
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/CodeGen/TargetPassConfig.cpp
llvm/lib/Target/X86/CMakeLists.txt
llvm/lib/Target/X86/X86.h
llvm/lib/Target/X86/X86ExpandPseudo.cpp
llvm/lib/Target/X86/X86InstrAMX.td
llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
llvm/lib/Target/X86/X86LowerAMXType.cpp
llvm/lib/Target/X86/X86TargetMachine.cpp
llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
llvm/test/CodeGen/X86/O0-pipeline.ll
llvm/tools/opt/opt.cpp
llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 731f17452cbe..57bf1b477d10 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -101,6 +101,7 @@ TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")
// AMX internal builtin
+TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 88e222093824..b823392c111d 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -507,6 +507,9 @@ namespace llvm {
/// or split the data to two <128 x i32>.
FunctionPass *createX86LowerAMXTypePass();
+ /// The pass insert tile config intrinsics for AMX fast register allocation.
+ FunctionPass *createX86PreAMXConfigPass();
+
/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
FunctionPass *createX86LowerAMXIntrinsicsPass();
diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index 1511045e4688..11138039a3c5 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -406,6 +406,10 @@ class TargetPassConfig : public ImmutablePass {
return false;
}
+ /// addPostFastRegAllocRewrite - Add passes to the unoptimized (fast) register
+ /// allocation pipeline after fast register allocation is complete.
+ virtual bool addPostFastRegAllocRewrite() { return false; }
+
/// Add passes to be run immediately after virtual registers are rewritten
/// to physical registers.
virtual void addPostRewrite() { }
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 643018b0eedb..aa38fd3ca803 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5042,6 +5042,9 @@ let TargetPrefix = "x86" in {
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
ImmArg<ArgIndex<2>>]>;
// AMX - internal intrinsics
+ def int_x86_ldtilecfg_internal :
+ GCCBuiltin<"__builtin_ia32_tile_loadconfig_internal">,
+ Intrinsic<[], [llvm_ptr_ty], []>;
def int_x86_tileloadd64_internal :
GCCBuiltin<"__builtin_ia32_tileloadd64_internal">,
Intrinsic<[llvm_x86amx_ty],
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index b5e8f9e91c2a..f5a016e94917 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1321,6 +1321,10 @@ bool TargetPassConfig::addRegAssignAndRewriteFast() {
report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
addPass(createRegAllocPass(false));
+
+ // Allow targets to change the register assignments after
+ // fast register allocation.
+ addPostFastRegAllocRewrite();
return true;
}
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 09ffc2ee7187..a2816f6e5e84 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -34,8 +34,10 @@ set(sources
X86DiscriminateMemOps.cpp
X86LowerTileCopy.cpp
X86LowerAMXType.cpp
+ X86PreAMXConfig.cpp
X86LowerAMXIntrinsics.cpp
X86TileConfig.cpp
+ X86FastTileConfig.cpp
X86PreTileConfig.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 0240dc77a1ee..eba5b6ce7836 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -79,6 +79,9 @@ FunctionPass *createX86WinAllocaExpander();
/// Return a pass that config the tile registers.
FunctionPass *createX86TileConfigPass();
+/// Return a pass that config the tile registers after fast reg allocation.
+FunctionPass *createX86FastTileConfigPass();
+
/// Return a pass that insert pseudo tile config instruction.
FunctionPass *createX86PreTileConfigPass();
@@ -172,8 +175,10 @@ void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
+void initializeX86FastTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
+void initializeX86PreAMXConfigPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 29a7b9840f44..ce794ba46fdc 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -478,6 +478,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
+ case X86::PLDTILECFGV: {
+ MI.setDesc(TII->get(X86::LDTILECFG));
+ return true;
+ }
case X86::PTILELOADDV: {
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
new file mode 100644
index 000000000000..d254928fa1dd
--- /dev/null
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -0,0 +1,306 @@
+//===-- X86FastTileConfig.cpp - Fast Tile Register Configure---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to config the shape of AMX physical registers
+/// AMX register need to be configured before use. Before FastRegAllocation pass
+/// the ldtilecfg instruction is inserted, however at that time we don't
+/// know the shape of each physical tile registers, because the register
+/// allocation is not done yet. This pass runs after the register allocation
+/// pass. It collects the shape information of each physical tile register
+/// and store the shape in the stack slot that is allocated for load config
+/// to tile config register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "fasttileconfig"
+
+namespace {
+
+/// Machine function pass that runs after fast register allocation and
+/// retargets the pre-written tile shape stores so that they describe the
+/// physical TMM registers that were actually assigned.
+class X86FastTileConfig : public MachineFunctionPass {
+  // Per-function context, refreshed at the start of runOnMachineFunction().
+  MachineFunction *MF = nullptr;
+  const X86Subtarget *ST = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+  const TargetInstrInfo *TII = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+public:
+  X86FastTileConfig() : MachineFunctionPass(ID) {}
+
+  /// Process every PLDTILECFGV in the current function. Returns true if at
+  /// least one such instruction was found.
+  bool fastTileConfig();
+
+  /// Return true if \p MI is a PTILELOADDV.
+  bool isTileLoad(MachineInstr &MI);
+
+  /// Return true if \p MI is a PTILESTOREDV.
+  bool isTileStore(MachineInstr &MI);
+
+  /// Return true if \p MI has a physical tile register operand and is neither
+  /// a PLDTILECFGV nor a debug instruction.
+  bool isAMXInstr(MachineInstr &MI);
+
+  /// Find the key AMX instruction that follows the config instruction \p MI
+  /// in its basic block.
+  MachineInstr *getKeyAMXInstr(MachineInstr *MI);
+
+  /// Collect the tile operands of the key AMX instruction following \p MI,
+  /// uses first and defs last.
+  void getTileShapesCfg(MachineInstr *MI,
+                        SmallVector<MachineOperand *> &ShapedTiles);
+
+  /// Collect the row/column shape stores that precede the config instruction
+  /// \p MI, keyed by the tile position encoded in the stored-to value's name.
+  void getShapeCfgInstrs(MachineInstr *MI,
+                         std::map<unsigned, MachineInstr *> &RowCfgs,
+                         std::map<unsigned, MachineInstr *> &ColCfgs);
+
+  /// Return the pass name.
+  StringRef getPassName() const override {
+    return "Fast Tile Register Configure";
+  }
+
+  /// Fix up the shape stores feeding the config instruction \p MI.
+  void materializeTileCfg(MachineInstr *MI);
+
+  /// Redirect each shape store to the config slot of the physical tile
+  /// register held by the matching entry of \p ShapedTiles.
+  void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles,
+                      std::map<unsigned, MachineInstr *> &RowCfgs,
+                      std::map<unsigned, MachineInstr *> &ColCfgs);
+
+  /// Cache the per-function context and run fastTileConfig() on \p MFunc.
+  bool runOnMachineFunction(MachineFunction &MFunc) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoPHIs);
+  }
+
+  static char ID;
+};
+
+} // end anonymous namespace
+
+char X86FastTileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
+ "Fast Tile Register Configure", false, false)
+INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
+ "Fast Tile Register Configure", false, false)
+
+/// Return true if \p Op is a register operand naming one of the physical
+/// tile registers TMM0..TMM7.
+static bool isTilePhysReg(MachineOperand &Op) {
+  return Op.isReg() && Op.getReg() >= X86::TMM0 && Op.getReg() <= X86::TMM7;
+}
+
+/// Return the zero-based index of the physical tile register held by \p Op
+/// (TMM0 -> 0, TMM1 -> 1, ...). \p Op must be a physical tile register.
+static unsigned getTilePhysRegIdx(MachineOperand *Op) {
+  assert(isTilePhysReg(*Op) && "Tile Operand is invalid");
+  const unsigned TileReg = Op->getReg();
+  return TileReg - X86::TMM0;
+}
+
+/// Point the row-count store \p MI at the slot of tile \p TIdx. In the tile
+/// config layout the per-tile row bytes start at offset 48.
+/// NOTE(review): operand 3 is presumably the displacement of the store's
+/// memory reference - confirm against the store opcodes emitted upstream.
+static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) {
+  const unsigned RowsBase = 48;
+  MI->getOperand(3).ChangeToImmediate(RowsBase + TIdx);
+}
+
+/// Point the bytes-per-row store \p MI at the slot of tile \p TIdx. In the
+/// tile config layout the per-tile colsb fields are two bytes wide and start
+/// at offset 16.
+/// NOTE(review): operand 3 is presumably the displacement of the store's
+/// memory reference - confirm against the store opcodes emitted upstream.
+static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
+  const unsigned ColsbBase = 16;
+  MI->getOperand(3).ChangeToImmediate(ColsbBase + TIdx * 2);
+}
+
+/// Return true if \p MI is the tile load pseudo (PTILELOADDV).
+bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == X86::PTILELOADDV;
+}
+/// Return true if \p MI is the tile store pseudo (PTILESTOREDV).
+bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  return Opc == X86::PTILESTOREDV;
+}
+/// Return true if \p MI references a physical tile register. The tile config
+/// pseudo (PLDTILECFGV) and debug instructions are never reported.
+bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) {
+  // TODO: May need to handle some special non-tile AMX instructions.
+  const bool IsCfgOrDebug =
+      MI.isDebugInstr() || MI.getOpcode() == X86::PLDTILECFGV;
+  if (!IsCfgOrDebug)
+    for (MachineOperand &Opnd : MI.operands())
+      if (isTilePhysReg(Opnd))
+        return true;
+
+  return false;
+}
+
+/// Scan forward from the config instruction \p MI to the end of its block and
+/// return the key AMX instruction it configures. The scan stops at the first
+/// tile store. Each tile load seen becomes the current candidate, and a
+/// non-load AMX instruction replaces it (at most one such instruction is
+/// expected).
+MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) {
+  MachineBasicBlock *Parent = MI->getParent();
+  MachineInstr *Key = nullptr;
+  int NumNonLoadAMX = 0;
+
+  for (auto It = MachineBasicBlock::iterator(MI), End = Parent->end();
+       It != End; ++It) {
+    MachineInstr &Cur = *It;
+
+    // A tile store closes the key AMX area.
+    if (isTileStore(Cur)) {
+      assert(Key && "Key AMX Should be found before!");
+      break;
+    }
+
+    if (isTileLoad(Cur)) {
+      Key = &Cur;
+    } else if (isAMXInstr(Cur)) {
+      assert((NumNonLoadAMX == 0) && "Too many Key AMX instruction!");
+      ++NumNonLoadAMX;
+      Key = &Cur;
+    }
+  }
+  assert(Key && "There must be an AMX instruction.");
+  return Key;
+}
+
+// Append the physical tile operands of the key AMX instruction to
+// ShapedTiles: all tile uses in operand order, followed by all tile defs in
+// operand order.
+void X86FastTileConfig::getTileShapesCfg(
+    MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) {
+  MachineInstr *KeyMI = getKeyAMXInstr(CfgMI);
+
+  // First pass: tile uses.
+  for (MachineOperand &MO : KeyMI->operands())
+    if (isTilePhysReg(MO) && !MO.isDef())
+      ShapedTiles.push_back(&MO);
+
+  // Second pass: tile defs.
+  for (MachineOperand &MO : KeyMI->operands())
+    if (isTilePhysReg(MO) && MO.isDef())
+      ShapedTiles.push_back(&MO);
+}
+
+// The shapes are pre-configured by stores whose memory operand value is named
+// "amx.tmm.N.shape.row*" or "amx.tmm.N.shape.col*". The 'N' is the position
+// of the tile in the key AMX intrinsic.
+//
+// Walk backwards from the config instruction MI (inclusive) to the start of
+// its block, stopping at the first AMX instruction, terminator or call, and
+// record every such store in RowCfgs / ColCfgs keyed by N.
+void X86FastTileConfig::getShapeCfgInstrs(
+    MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs,
+    std::map<unsigned, MachineInstr *> &ColCfgs) {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // Inspect one instruction. Returns false when the backward scan must stop.
+  auto Visit = [&](MachineInstr &Inst) -> bool {
+    if (isAMXInstr(Inst) || Inst.isTerminator() || Inst.isCall())
+      return false;
+    if (!Inst.mayStore() || !Inst.hasOneMemOperand())
+      return true;
+    const Value *MemPtr = Inst.memoperands()[0]->getValue();
+    if (!MemPtr)
+      return true;
+
+    StringRef Name = MemPtr->getName();
+    if (!Name.startswith("amx.tmm."))
+      return true;
+
+    // Get the 'N'th tile shape config in key amx instruction. The digits sit
+    // between the "amx.tmm." prefix (8 characters) and ".shape".
+    auto N = Name.find(".shape");
+    StringRef STileIdx = Name.slice(8, N);
+    unsigned Idx = 0;
+    // getAsInteger() returns true on failure. Never use an unparsed index as
+    // a map key.
+    if (STileIdx.getAsInteger(10, Idx))
+      llvm_unreachable("Invalid tile index in shape name!");
+
+    // And relate the index to its store instruction.
+    if (Name.contains("row"))
+      RowCfgs[Idx] = &Inst;
+    else if (Name.contains("col"))
+      ColCfgs[Idx] = &Inst;
+    else
+      llvm_unreachable("Invalid tile shape info!");
+    return true;
+  };
+
+  // The begin() test follows the visit so that the first instruction of the
+  // block is examined too, and so that the iterator is never decremented past
+  // begin().
+  for (auto II = MachineBasicBlock::iterator(MI);; --II) {
+    if (!Visit(*II) || II == MBB->begin())
+      break;
+  }
+
+  assert((RowCfgs.size() == ColCfgs.size()) &&
+         "The number of tile row and col must be equal!");
+}
+
+// Here is the data format for the tile config.
+// 0 palette = 1 now.
+// 1 start_row = 0 now.
+// 2-15 reserved, must be zero
+// 16-17 tile0.colsb Tile 0 bytes per row.
+// 18-19 tile1.colsb Tile 1 bytes per row.
+// 20-21 tile2.colsb Tile 2 bytes per row.
+// ... (sequence continues)
+// 30-31 tile7.colsb Tile 7 bytes per row.
+// 32-47 reserved, must be zero
+// 48 tile0.rows Tile 0 rows.
+// 49 tile1.rows Tile 1 rows.
+// 50 tile2.rows Tile 2 rows.
+// ... (sequence continues)
+// 55 tile7.rows Tile 7 rows.
+// 56-63 reserved, must be zero
+/// For the tile at position Pos in \p ShapedTiles, retarget the row and
+/// column shape stores recorded under Pos to the config slots of the physical
+/// tile register that operand holds. Nothing is rewritten when the position
+/// already equals the physical register index.
+void X86FastTileConfig::rewriteTileCfg(
+    SmallVector<MachineOperand *> &ShapedTiles,
+    std::map<unsigned, MachineInstr *> &RowCfgs,
+    std::map<unsigned, MachineInstr *> &ColCfgs) {
+  assert((RowCfgs.size() == ShapedTiles.size()) &&
+         "The number of tile shapes not equal with the number of tiles!");
+
+  unsigned Pos = 0;
+  for (MachineOperand *Tile : ShapedTiles) {
+    const unsigned PhysIdx = getTilePhysRegIdx(Tile);
+    if (Pos != PhysIdx) {
+      adjustRowCfg(PhysIdx, RowCfgs[Pos]);
+      adjustColCfg(PhysIdx, ColCfgs[Pos]);
+    }
+    ++Pos;
+  }
+}
+
+// We have already preconfig the shapes before fast register allocation at
+// X86PreAMXConfig::preWriteTileCfg(). Now, we have done fast register
+// allocation, the shapes pre-written before may not rightly corresponding
+// to the correct tmm registers, so we need adjust them.
+/// Fix up the shape stores that feed the config instruction \p CfgMI:
+/// gather the tile operands of its key AMX instruction, gather the matching
+/// row/column stores, then retarget the stores.
+void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) {
+  // Tile uses followed by tile defs of the key AMX instruction.
+  SmallVector<MachineOperand *> Tiles;
+  getTileShapesCfg(CfgMI, Tiles);
+  assert(Tiles.size() && "Not find shapes config!");
+
+  std::map<unsigned, MachineInstr *> RowStores;
+  std::map<unsigned, MachineInstr *> ColStores;
+  getShapeCfgInstrs(CfgMI, RowStores, ColStores);
+
+  rewriteTileCfg(Tiles, RowStores, ColStores);
+}
+
+/// Materialize the tile configuration for every PLDTILECFGV in the function.
+/// Returns true if any block contained at least one PLDTILECFGV.
+bool X86FastTileConfig::fastTileConfig() {
+  bool AnyCfg = false;
+
+  for (MachineBasicBlock &Block : *MF) {
+    // Collect the config instructions of this block first, then process them.
+    SmallVector<MachineInstr *, 2> CfgInsts;
+    for (MachineInstr &Inst : Block)
+      if (Inst.getOpcode() == X86::PLDTILECFGV)
+        CfgInsts.push_back(&Inst);
+
+    if (CfgInsts.empty())
+      continue;
+
+    AnyCfg = true;
+    for (MachineInstr *Cfg : CfgInsts)
+      materializeTileCfg(Cfg);
+  }
+  return AnyCfg;
+}
+
+/// Pass entry point: cache the subtarget, register and instruction info of
+/// \p MFunc, then run fastTileConfig() and return its result.
+bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
+  ST = &MFunc.getSubtarget<X86Subtarget>();
+  TRI = ST->getRegisterInfo();
+  TII = MFunc.getSubtarget().getInstrInfo();
+  MRI = &MFunc.getRegInfo();
+  MF = &MFunc;
+
+  return fastTileConfig();
+}
+
+/// Factory for the fast tile config pass. The caller receives a newly
+/// allocated X86FastTileConfig.
+FunctionPass *llvm::createX86FastTileConfigPass() {
+  return new X86FastTileConfig;
+}
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index dd1590f6d589..b83856cae723 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,6 +48,8 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8XD;
// Pseduo instruction for RA.
+ def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src),
+ [(int_x86_ldtilecfg_internal addr:$src)]>;
def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index f561c8457b08..248069f4deb4 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -34,6 +34,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -52,6 +53,10 @@ static bool isV256I32Ty(Type *Ty) {
}
#endif
+static cl::opt<bool>
+ X86ScalarizeAMX("enable-x86-scalar-amx", cl::init(false), cl::Hidden,
+ cl::desc("X86: enable AMX scalarizition."));
+
namespace {
class X86LowerAMXIntrinsics {
Function &Func;
@@ -93,6 +98,7 @@ class X86LowerAMXIntrinsics {
lowerTileDP(Instruction *TileDP);
bool lowerTileZero(Instruction *TileZero);
};
+} // anonymous namespace
BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader,
BasicBlock *Exit, Value *Bound,
@@ -624,9 +630,6 @@ bool X86LowerAMXIntrinsics::visit() {
return C;
}
-} // anonymous namespace
-
-namespace {
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
public:
@@ -638,6 +641,8 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
}
bool runOnFunction(Function &F) override {
+ if (!X86ScalarizeAMX)
+ return false;
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
TM->getOptLevel() != CodeGenOpt::None)
@@ -661,8 +666,6 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
}
};
-} // anonymous namespace
-
static const char PassName[] = "Lower AMX intrinsics";
char X86LowerAMXIntrinsicsLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 2150a9d611bc..378ebc84c733 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -1,4 +1,4 @@
-//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
+//===- Target/X86/X86LowerAMXType.cpp - -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,6 +14,27 @@
/// load/store <256 x i32> instruction to AMX load/store. If the bitcast can
/// not be combined with load/store, we transform the bitcast to amx load/store
/// and <256 x i32> store/load.
+///
+/// If Front End not use O0 but the Mid/Back end use O0, (e.g. "Clang -O2 -S
+/// -emit-llvm t.c" + "llc t.ll") we should make sure the amx data is volatile,
+/// because that is necessary for AMX fast register allocation. (In Fast
+/// register allocation, register will be allocated before spill/reload, so
+/// there is no additional register for amx to identify the step in spill.)
+/// The volatileTileData() will handle this case.
+/// e.g.
+/// ----------------------------------------------------------
+/// | def %td = ... |
+/// | ... |
+/// | "use %td" |
+/// ----------------------------------------------------------
+/// will transfer to -->
+/// ----------------------------------------------------------
+/// | def %td = ... |
+/// | call void @llvm.x86.tilestored64.internal(mem, %td) |
+/// | ... |
+/// | %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)|
+/// | "use %td2" |
+/// ----------------------------------------------------------
//
//===----------------------------------------------------------------------===//
//
@@ -41,7 +62,8 @@ using namespace PatternMatch;
#define DEBUG_TYPE "lower-amx-type"
-static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
+static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
+ BasicBlock *BB) {
Function &F = *BB->getParent();
Module *M = BB->getModule();
const DataLayout &DL = M->getDataLayout();
@@ -56,7 +78,44 @@ static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
return AllocaRes;
}
-static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+namespace {
+class X86LowerAMXType {
+ Function &Func; // Function processed by visit().
+ TargetMachine *TM = nullptr; // Queried by getShape() for the codegen opt level.
+
+ // In AMX intrinsics we let Shape = {Row, Col}, but the
+ // RealCol = Col / ElementSize. We may use the RealCol
+ // as a new Row for other newly created AMX intrinsics.
+ std::map<Value *, Value *> Col2Row; // Cache filled by getRowFromCol(); cleared by visit().
+
+public:
+ X86LowerAMXType(Function &F, TargetMachine *TargetM) : Func(F), TM(TargetM) {}
+ bool visit();
+ void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
+ void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
+ bool transformBitcast(BitCastInst *Bitcast);
+ std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
+ Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); // Returns V / Granularity, memoized per V.
+};
+
+/// Return a value equal to \p V udiv \p Granularity (as an i16 constant),
+/// creating the division at most once per \p V. The division is emitted right
+/// after \p V when \p V is an instruction, otherwise at the first insertion
+/// point of the block containing \p II.
+Value *X86LowerAMXType::getRowFromCol(Instruction *II, Value *V,
+                                      unsigned Granularity) {
+  auto Cached = Col2Row.find(V);
+  if (Cached != Col2Row.end())
+    return Cached->second;
+
+  IRBuilder<> Builder(&*II->getParent()->getFirstInsertionPt());
+  if (auto *Def = dyn_cast<Instruction>(V)) {
+    // Emit immediately after the definition of V.
+    BasicBlock::iterator After = Def->getIterator();
+    ++After;
+    Builder.SetInsertPoint(&*After);
+  }
+
+  ConstantInt *Divisor = Builder.getInt16(Granularity);
+  Value *Quotient = Builder.CreateUDiv(V, Divisor);
+  Col2Row[V] = Quotient;
+  return Quotient;
+}
+
+std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
+ unsigned OpNo) {
Value *Row = nullptr, *Col = nullptr;
switch (II->getIntrinsicID()) {
default:
@@ -85,6 +144,13 @@ static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
break;
case 5:
Row = II->getArgOperand(2);
+ // FIXME: There is a design bug for AMX shape, which the Col should be
+ // Col/4 if it will be used as Row, but current Greedy RA can't handle
+ // this case well, it may failed if we generate a new Shape definition.
+ // So Let's just do it in O0 first.
+ // Row = Row / 4
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ Row = getRowFromCol(II, Row, 4);
Col = II->getArgOperand(1);
break;
}
@@ -100,7 +166,7 @@ static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
// -->
// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
// i8* %addr, i64 %stride64)
-static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
+void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
Value *Row = nullptr, *Col = nullptr;
Use &U = *(Bitcast->use_begin());
unsigned OpNo = U.getOperandNo();
@@ -125,7 +191,7 @@ static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
// -->
// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
// %stride64, %13)
-static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
+void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
Value *Tile = Bitcast->getOperand(0);
auto *II = cast<IntrinsicInst>(Tile);
@@ -157,14 +223,14 @@ static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
}
// transform bitcast to <store, load> instructions.
-static bool transformBitcast(BitCastInst *Bitcast) {
+bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
IRBuilder<> Builder(Bitcast);
AllocaInst *AllocaAddr;
Value *I8Ptr, *Stride;
auto *Src = Bitcast->getOperand(0);
auto Prepare = [&]() {
- AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
+ AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent());
I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
Stride = Builder.getInt64(64);
};
@@ -215,17 +281,9 @@ static bool transformBitcast(BitCastInst *Bitcast) {
return true;
}
-namespace {
-class X86LowerAMXType {
- Function &Func;
-
-public:
- X86LowerAMXType(Function &F) : Func(F) {}
- bool visit();
-};
-
bool X86LowerAMXType::visit() {
SmallVector<Instruction *, 8> DeadInsts;
+ Col2Row.clear();
for (BasicBlock *BB : post_order(&Func)) {
for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
@@ -322,6 +380,260 @@ bool X86LowerAMXType::visit() {
}
} // anonymous namespace
+/// Create a <256 x i32> alloca at the very start of the entry block of the
+/// function containing \p BB and return it bitcast to i8*. The bitcast is
+/// placed directly after the new alloca.
+static Value *getAllocaPos(BasicBlock *BB) {
+  Function *F = BB->getParent();
+  Instruction &EntryFront = F->getEntryBlock().front();
+  IRBuilder<> Builder(&EntryFront);
+
+  const DataLayout &DL = BB->getModule()->getDataLayout();
+  Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
+  AllocaInst *Slot =
+      new AllocaInst(V256I32Ty, DL.getAllocaAddrSpace(), "", &EntryFront);
+
+  // Continue building right behind the alloca.
+  BasicBlock::iterator AfterSlot = Slot->getIterator();
+  ++AfterSlot;
+  Builder.SetInsertPoint(&*AfterSlot);
+  return Builder.CreateBitCast(Slot, Builder.getInt8PtrTy());
+}
+
+/// Emit a tilestored64.internal call that stores the tile defined by
+/// \p TileDef to \p Ptr with a stride of 64, immediately after \p TileDef.
+/// The shape (row, col) is taken from operands 0 and 1 of \p TileDef, which
+/// must be an intrinsic call producing an x86_amx value.
+/// \returns the newly created store instruction.
+static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
+  assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
+  // cast<> never yields null, so validate the kind before casting. This keeps
+  // the intended diagnostic reachable.
+  assert(isa<IntrinsicInst>(TileDef) && "Not tile intrinsic!");
+  auto *II = cast<IntrinsicInst>(TileDef);
+  Value *Row = II->getOperand(0);
+  Value *Col = II->getOperand(1);
+
+  BasicBlock *BB = TileDef->getParent();
+  BasicBlock::iterator Iter = TileDef->getIterator();
+  IRBuilder<> Builder(BB, ++Iter);
+  Value *Stride = Builder.getInt64(64);
+  std::array<Value *, 5> Args = {Row, Col, Ptr, Stride, TileDef};
+
+  Instruction *TileStore =
+      Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+  return TileStore;
+}
+
+/// Replace the tile value used through \p U with a fresh tileloadd64.internal
+/// from \p Ptr (stride 64), emitted right before the user. Every operand of
+/// the user equal to the old value is redirected to the load.
+///
+/// The shape (row, col) comes from operands 0 and 1 of the defining
+/// intrinsic. When \p IsPHI is true the used value must be a PHI node, and
+/// the shape is taken from its first incoming value instead.
+static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
+  Value *V = U.get();
+  assert(V->getType()->isX86_AMXTy() && "Not define tile!");
+
+  // Get tile shape. The kinds are known here, so use cast<> (which asserts)
+  // rather than dereferencing an unchecked dyn_cast<> result.
+  Value *ShapeSrc = IsPHI ? cast<PHINode>(V)->getIncomingValue(0) : V;
+  auto *II = cast<IntrinsicInst>(ShapeSrc);
+  Value *Row = II->getOperand(0);
+  Value *Col = II->getOperand(1);
+
+  auto *UserI = cast<Instruction>(U.getUser());
+  IRBuilder<> Builder(UserI);
+  Value *Stride = Builder.getInt64(64);
+  std::array<Value *, 4> Args = {Row, Col, Ptr, Stride};
+
+  Value *TileLoad =
+      Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
+  UserI->replaceUsesOfWith(V, TileLoad);
+}
+
+/// Return true if at least one user of \p I is a PHI node.
+static bool isIncomingOfPHI(Instruction *I) {
+  for (User *Usr : I->users())
+    if (isa<PHINode>(Usr))
+      return true;
+  return false;
+}
+
+// Let all AMX tile data become volatile data, shorten the life range
+// of each tile register before fast register allocation.
+namespace {
+class X86VolatileTileData {
+ Function &F; // Function whose blocks volatileTileData() walks.
+
+public:
+ X86VolatileTileData(Function &Func) : F(Func) {}
+ Value *updatePhiIncomings(BasicBlock *BB,
+ SmallVector<Instruction *, 2> &Imcomings); // Returns the shared i8* slot.
+ void replacePhiDefWithLoad(Instruction *PHI, Value *StorePtr); // Erases PHI.
+ bool volatileTileData(); // Entry point; true if anything was rewritten.
+ void volatileTilePHI(PHINode *Inst);
+ void volatileTileNonPHI(Instruction *I);
+};
+
+/// Allocate one memory slot for the PHI in \p BB, store every incoming tile
+/// in \p Imcomings to it right after its definition, and make every non-PHI
+/// use of those tiles reload from the slot.
+/// \returns the i8* pointer to the slot.
+Value *X86VolatileTileData::updatePhiIncomings(
+    BasicBlock *BB, SmallVector<Instruction *, 2> &Imcomings) {
+  Value *I8Ptr = getAllocaPos(BB);
+
+  for (auto *I : Imcomings) {
+    User *Store = createTileStore(I, I8Ptr);
+
+    // replaceWithTileLoad() rewrites the user's operand, which unlinks that
+    // Use from I's use list. Iterating I->uses() directly would therefore
+    // stop after the first rewritten use, so snapshot the uses first.
+    SmallVector<Use *, 8> Uses;
+    for (Use &U : I->uses())
+      Uses.push_back(&U);
+
+    // All its uses (except phi) should load from stored mem.
+    for (Use *U : Uses) {
+      // Already redirected together with another operand of the same user.
+      if (U->get() != I)
+        continue;
+      User *V = U->getUser();
+      if (isa<PHINode>(V) || V == Store)
+        continue;
+      replaceWithTileLoad(*U, I8Ptr);
+    }
+  }
+  return I8Ptr;
+}
+
+/// Make every use of the tile PHI \p PHI reload the tile from \p StorePtr,
+/// then erase \p PHI.
+void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI,
+                                                Value *StorePtr) {
+  // replaceWithTileLoad() unlinks the rewritten Use from PHI's use list, so
+  // iterating PHI->uses() directly would end after the first use and leave
+  // the PHI with live uses when it is erased. Snapshot the uses first.
+  SmallVector<Use *, 8> Uses;
+  for (Use &U : PHI->uses())
+    Uses.push_back(&U);
+
+  for (Use *U : Uses) {
+    // Skip uses already redirected together with another operand of the same
+    // user. Their value is no longer the PHI.
+    if (U->get() != PHI)
+      continue;
+    replaceWithTileLoad(*U, StorePtr, true);
+  }
+  PHI->eraseFromParent();
+}
+
+// Similar to volatileTileNonPHI, but this function only handles PHI nodes
+// and their related AMX intrinsics.
+// 1) PHI Def should change to tileload.
+// 2) PHI Incoming Values should tilestored in just after their def.
+// 3) The mem of these tileload and tilestores should be same.
+// e.g.
+// ------------------------------------------------------
+// bb_dom:
+// ...
+// br i1 %bool.cond, label %if.else, label %if.then
+//
+// if.then:
+// def %t0 = ...
+// ...
+// use %t0
+// ...
+// br label %if.end
+//
+// if.else:
+// def %t1 = ...
+// br label %if.end
+//
+// if.end:
+// %td = phi x86_amx [ %t1, %if.else ], [ %t0, %if.then ]
+// ...
+// use %td
+// ------------------------------------------------------
+// -->
+// ------------------------------------------------------
+// bb_entry:
+// %mem = alloca <256 x i32>, align 1024 *
+// ...
+// bb_dom:
+// ...
+// br i1 %bool.cond, label %if.else, label %if.then
+//
+// if.then:
+// def %t0 = ...
+// call void @llvm.x86.tilestored64.internal(mem, %t0) *
+// ...
+// %t0` = call x86_amx @llvm.x86.tileloadd64.internal(mem)*
+// use %t0` *
+// ...
+// br label %if.end
+//
+// if.else:
+// def %t1 = ...
+// call void @llvm.x86.tilestored64.internal(mem, %t1) *
+// br label %if.end
+//
+// if.end:
+// ...
+// %td = call x86_amx @llvm.x86.tileloadd64.internal(mem) *
+// use %td
+// ------------------------------------------------------
+/// Rewrite the tile PHI \p PHI through memory: its incoming tiles are stored
+/// to a shared slot and the PHI itself is replaced by loads from that slot.
+void X86VolatileTileData::volatileTilePHI(PHINode *PHI) {
+  SmallVector<Instruction *, 2> Imcomings;
+
+  const unsigned NumIncoming = PHI->getNumIncomingValues();
+  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+    Instruction *Inst = dyn_cast<Instruction>(PHI->getIncomingValue(Idx));
+    assert(Inst && "We shouldn't fold AMX instrution!");
+    Imcomings.push_back(Inst);
+  }
+
+  Value *StorePtr = updatePhiIncomings(PHI->getParent(), Imcomings);
+  replacePhiDefWithLoad(PHI, StorePtr);
+}
+
+// Store the defined tile and load it before use.
+// All its users are not PHI.
+// e.g.
+// ------------------------------------------------------
+// def %td = ...
+// ...
+// "use %td"
+// ------------------------------------------------------
+// -->
+// ------------------------------------------------------
+// def %td = ...
+// call void @llvm.x86.tilestored64.internal(mem, %td)
+// ...
+// %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)
+// "use %td2"
+// ------------------------------------------------------
+/// Store the tile defined by \p I to a fresh memory slot right after its
+/// definition and make every other use reload it from that slot. None of the
+/// users of \p I may be a PHI node.
+void X86VolatileTileData::volatileTileNonPHI(Instruction *I) {
+  BasicBlock *BB = I->getParent();
+  Value *I8Ptr = getAllocaPos(BB);
+  User *Store = createTileStore(I, I8Ptr);
+
+  // replaceWithTileLoad() rewrites the user's operand, which unlinks that Use
+  // from I's use list. Iterating I->uses() directly would therefore stop
+  // after the first rewritten use, so snapshot the uses first.
+  SmallVector<Use *, 8> Uses;
+  for (Use &U : I->uses())
+    Uses.push_back(&U);
+
+  // All its uses should load from stored mem.
+  for (Use *U : Uses) {
+    // Already redirected together with another operand of the same user.
+    if (U->get() != I)
+      continue;
+    User *V = U->getUser();
+    assert(!isa<PHINode>(V) && "PHI Nodes should be excluded!");
+    if (V != Store)
+      replaceWithTileLoad(*U, I8Ptr);
+  }
+}
+
+// Volatile Tile Model:
+// 1) All the uses of tile data comes from tileload in time.
+// 2) All the defs of tile data tilestore into mem immediately.
+// For example:
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
+// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(... td) area
+// --------------------------------------------------------------------------
+// 3) No terminator, call or other amx instructions in the key amx area.
+/// Apply the volatile tile model to every block of the function: each tile
+/// definition is stored to memory and each use reloads it.
+/// \returns true if any instruction was rewritten.
+bool X86VolatileTileData::volatileTileData() {
+  bool Changed = false;
+  for (BasicBlock &BB : F) {
+    // Collect the tile-typed instructions up front, because the rewrites
+    // below insert new instructions into the block.
+    SmallVector<PHINode *, 2> TilePHIs;
+    SmallVector<Instruction *, 8> TileDefs;
+
+    for (Instruction &I : BB) {
+      if (!I.getType()->isX86_AMXTy())
+        continue;
+      if (auto *PN = dyn_cast<PHINode>(&I))
+        TilePHIs.push_back(PN);
+      else
+        TileDefs.push_back(&I);
+    }
+
+    // First we "volatile" the non-phi related amx intrinsics.
+    for (Instruction *Def : TileDefs) {
+      if (!isIncomingOfPHI(Def)) {
+        volatileTileNonPHI(Def);
+        Changed = true;
+      }
+    }
+
+    // Then the PHIs together with their incoming definitions.
+    for (PHINode *PN : TilePHIs) {
+      volatileTilePHI(PN);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+} // anonymous namespace
+
namespace {
class X86LowerAMXTypeLegacyPass : public FunctionPass {
@@ -334,11 +646,24 @@ class X86LowerAMXTypeLegacyPass : public FunctionPass {
bool runOnFunction(Function &F) override {
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
- if (F.hasFnAttribute(Attribute::OptimizeNone) ||
- TM->getOptLevel() == CodeGenOpt::None)
- return false;
- X86LowerAMXType LAT(F);
+
+ X86LowerAMXType LAT(F, TM);
bool C = LAT.visit();
+
+ // Prepare for fast register allocation at O0.
+ // Todo: May better check the volatile model of AMX code, not just
+ // by checking Attribute::OptimizeNone and CodeGenOpt::None.
+ if (TM->getOptLevel() == CodeGenOpt::None) {
+ // If the front end does not use O0 but the mid/back end does (e.g.
+ // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll"), we should make
+ // sure the amx data is volatile, which is necessary for AMX fast
+ // register allocation.
+ if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
+ X86VolatileTileData VTD(F);
+ C = VTD.volatileTileData() || C;
+ }
+ }
+
return C;
}
diff --git a/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/llvm/lib/Target/X86/X86PreAMXConfig.cpp
new file mode 100644
index 000000000000..fad5c73bc92d
--- /dev/null
+++ b/llvm/lib/Target/X86/X86PreAMXConfig.cpp
@@ -0,0 +1,422 @@
+//===- Target/X86/X86PreAMXConfig.cpp - ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// Insert tilecfg for each area of key AMX intrinsic.
+/// All of a key AMX intrinsic's tile operands must come from tileload, and
+/// the tile defined by a key AMX intrinsic must be tilestored.
+/// Take tdpbssd for example:
+/// --------------------------------------------------------------------------
+/// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(...) key
+/// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(...) |
+/// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(...) amx
+/// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(t1, t2, t3) |
+/// call void @llvm.x86.tilestored64.internal(... td) area
+/// --------------------------------------------------------------------------
+/// This pass will insert tilecfg before every key-amx-area, something like:
+/// --------------------------------------------------------------------------
+/// %cfgmem = alloca <16 x i32>, align 4 * allocate mem
+/// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init
+/// ...
+/// ... pre-config shape of %t1 *
+/// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
+/// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
+/// ... *
+/// ... pre-config shape of %t2 * shapes
+/// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 *
+/// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
+/// ...
+/// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * tile config
+//
+//===----------------------------------------------------------------------===//
+//
+#include "X86.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "pre-amx-config"
+
+// Returns true if the intrinsic either produces an x86_amx value or takes at
+// least one x86_amx operand.
+static bool isAMXIntrinsic(IntrinsicInst *II) {
+  if (II->getType()->isX86_AMXTy())
+    return true;
+  for (Value *Arg : II->operands()) {
+    if (Arg->getType()->isX86_AMXTy())
+      return true;
+  }
+  return false;
+}
+
+// Returns true if II is a call to llvm.x86.tileloadd64.internal.
+static bool isTileLoad(IntrinsicInst *II) {
+  const Intrinsic::ID IID = II->getIntrinsicID();
+  return IID == Intrinsic::x86_tileloadd64_internal;
+}
+
+// Returns true if II is a call to llvm.x86.tilestored64.internal.
+static bool isTileStore(IntrinsicInst *II) {
+  const Intrinsic::ID IID = II->getIntrinsicID();
+  return IID == Intrinsic::x86_tilestored64_internal;
+}
+
+#ifndef NDEBUG
+// Returns true if II produces an x86_amx value and none of its operands is
+// of x86_amx type. Only referenced from assertions.
+static bool onlyTileDef(IntrinsicInst *II) {
+  if (!II->getType()->isX86_AMXTy())
+    return false;
+  for (Value *Arg : II->operands()) {
+    if (Arg->getType()->isX86_AMXTy())
+      return false;
+  }
+  return true;
+}
+
+// Returns true if I is a non-intrinsic call or a terminator. Only referenced
+// from assertions.
+static bool brokenVolatile(Instruction *I) {
+  // Todo: it is weak to identify a normal call here.
+  const bool IsPlainCall = isa<CallInst>(I) && !isa<IntrinsicInst>(I);
+  return IsPlainCall || I->isTerminator();
+}
+#endif
+
+namespace {
+// Helper that inserts a tile configuration in front of every key AMX area of
+// one function.
+class X86PreAMXConfig {
+ // The function being processed.
+ Function &F;
+
+public:
+ X86PreAMXConfig(Function &Func) : F(Func) {}
+ // Entry point: collects the key AMX areas with findConfigShapes() and calls
+ // addTileConfig() for each of them. Returns false if none was found.
+ bool preTileConfig();
+ // Allocates the config memory in the entry block and, before ModelStart,
+ // emits its zero-initialization, the shape writes and the ldtilecfg call.
+ bool addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
+ // Scans F and fills PosAndShapes with, per key AMX area, the first AMX
+ // intrinsic of the area and the shapes collected for it. Returns true if
+ // at least one area was found.
+ bool findConfigShapes(
+ DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes);
+ // Appends to Shapes operands 0 and 1 of every tile load feeding KeyAMX and,
+ // unless KeyAMX is a tile store, operands 0 and 1 of KeyAMX itself.
+ bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes);
+ // Inserts, before Pos, the stores that write the palette and the row/col of
+ // each entry of Shapes into the memory at I8Ptr. Returns true if at least
+ // one shape was written.
+ bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
+ SmallVector<Value *, 8> &Shapes);
+ // Starting at Iter, walks forward to the tile store closing the area,
+ // fills Shapes, and returns the iterator of that tile store.
+ BasicBlock::iterator
+ getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
+ SmallVector<Value *, 8> &Shapes);
+ // Checks that Loads, Store and KeyAMX (may be null) form a volatile AMX
+ // area. Entries of Loads may be erased during the check.
+ bool checkVolatileModel(SmallSet<Value *, 4> &Loads, IntrinsicInst *Store,
+ IntrinsicInst *KeyAMX);
+};
+
+// Write the shapes into tilecfg's mem in order. This may not be right,
+// because the first shape may not correspond to the first tmm register,
+// so we need to handle it at X86FastTileConfig::materializeTileCfg()
+// after register allocation.
+// For example:
+// --------------------------------------------------------------------------
+// zeroinitialize tilecfg's mem (of ldtilecfg)
+// --------------------------------------------------------------------------
+// ... pre-config shape of %t1 *
+// %amx.tmm.0.shape.row = getelementptr i8, i8* %mem, i64 48 *
+// %amx.tmm.0.shape.col = getelementptr i16, i16* %mem, i64 16 *
+// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
+// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
+// ... *
+// ... pre-config shape of %t2 *
+// %amx.tmm.1.shape.row = getelementptr i8, i8* %mem, i64 49 *
+// %amx.tmm.1.shape.col = getelementptr i16, i16* %mem, i64 18 *
+// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes
+// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
+// ... *
+// ... pre-config shape of %t3 * of
+// %amx.tmm.2.shape.row = getelementptr i8, i8* %mem, i64 50 *
+// %amx.tmm.2.shape.col = getelementptr i16, i16* %mem, i64 20 *
+// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 *
+// ... * tiles
+// ... pre-config shape of %td *
+// %amx.tmm.3.shape.row = getelementptr i8, i8* %mem, i64 51 *
+// %amx.tmm.3.shape.col = getelementptr i16, i16* %mem, i64 22 *
+// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 *
+// --------------------------------------------------------------------------
+// call void @llvm.x86.ldtilecfg(i8* %mem) * tile config
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
+// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(... td) area
+// --------------------------------------------------------------------------
+bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
+                                      SmallVector<Value *, 8> &Shapes) {
+  LLVMContext &Ctx = Pos->getParent()->getContext();
+  Type *I8Ty = Type::getInt8Ty(Ctx);
+  Type *I16Ty = Type::getInt16Ty(Ctx);
+  IntegerType *I64Ty = Type::getInt64Ty(Ctx);
+
+  // TODO: The palette is currently always set to 1; it may be assigned
+  // another value in the future.
+  // The palette is the byte at offset 0 of the config memory.
+  Value *PaletteIdx = ConstantInt::get(I64Ty, 0);
+  Value *PaletteOne = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
+  Value *PaletteAddr =
+      GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteIdx, "", Pos);
+  new StoreInst(PaletteOne, PaletteAddr, Pos);
+
+  // Shapes is a flat list of (row, col) pairs, one pair per tile.
+  const int NumTiles = Shapes.size() / 2;
+  for (int Tile = 0; Tile < NumTiles; ++Tile) {
+    const std::string Prefix = "amx.tmm." + itostr(Tile);
+
+    // The row is one byte at offset 48 + Tile; the column is an i16 at byte
+    // offset 16 + 2 * Tile, reached through an i8 GEP plus a pointer bitcast.
+    Value *RowIdx = ConstantInt::get(I64Ty, 48 + Tile);
+    Value *ColIdx = ConstantInt::get(I64Ty, 16 + Tile * 2);
+    Value *RowAddr = GetElementPtrInst::Create(I8Ty, I8Ptr, RowIdx,
+                                               Prefix + ".shape.row", Pos);
+    Value *ColAddr = GetElementPtrInst::Create(I8Ty, I8Ptr, ColIdx, "", Pos);
+    ColAddr = new BitCastInst(ColAddr, PointerType::get(I16Ty, 0),
+                              Prefix + ".shape.col", Pos);
+
+    // The row value is truncated to i8 before it is stored; the column is
+    // stored as given.
+    Value *RowVal = new TruncInst(Shapes[Tile * 2], I8Ty, "", Pos);
+    Value *ColVal = Shapes[Tile * 2 + 1];
+    new StoreInst(RowVal, RowAddr, Pos);
+    new StoreInst(ColVal, ColAddr, Pos);
+  }
+
+  // Something was written exactly when there was at least one shape pair.
+  return NumTiles > 0;
+}
+
+// Emit the tile configuration for one key AMX area.
+//
+// A <16 x i32> alloca is inserted at the front of the entry block of F. Before
+// ModelStart, a bitcast of that alloca to i8* and a call to
+// llvm.x86.ldtilecfg.internal are created; the zero-initializing store and
+// the shape writes of preWriteTileCfg() are then inserted in front of the
+// ldtilecfg call. Always returns true.
+bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
+                                    SmallVector<Value *, 8> &Shapes) {
+  Module *M = F.getParent();
+  IRBuilder<> Builder(ModelStart);
+  const DataLayout &DL = M->getDataLayout();
+  unsigned AddrSpace = DL.getAllocaAddrSpace();
+  LLVMContext &Ctx = Builder.getContext();
+  Type *V512Ty = VectorType::get(Builder.getInt32Ty(), 16, false);
+  Align Alignment = DL.getPrefTypeAlign(Type::getInt32Ty(Ctx));
+
+  AllocaInst *Addr =
+      new AllocaInst(V512Ty, AddrSpace, "", &F.getEntryBlock().front());
+  Addr->setAlignment(Alignment);
+  Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
+
+  std::array<Value *, 1> Args = {I8Ptr};
+  Instruction *Cfg =
+      Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, Args);
+
+  // Zero the whole config memory before the individual fields are written.
+  // The result of `new` is never null, so it is neither asserted on nor used
+  // as the return value.
+  new StoreInst(Constant::getNullValue(V512Ty), Addr, false, Alignment, Cfg);
+
+  preWriteTileCfg(I8Ptr, Cfg, Shapes);
+
+  return true;
+}
+
+// Check that the collected tile loads, the optional key AMX intrinsic and the
+// tile store form one volatile AMX area. Entries are erased from Loads while
+// checking.
+// Todo: We may need to handle "more than one store" case in the future.
+bool X86PreAMXConfig::checkVolatileModel(SmallSet<Value *, 4> &Loads,
+                                         IntrinsicInst *Store,
+                                         IntrinsicInst *KeyAMX) {
+  // Operand 4 of the tile store is the tile value being stored.
+  Value *StoredTile = Store->getOperand(4);
+
+  // Without a key AMX intrinsic the area is just one tileload feeding the
+  // tilestore.
+  if (KeyAMX == nullptr)
+    return Loads.size() == 1 && Loads.contains(StoredTile);
+
+  // Every tile operand of KeyAMX has to be one of the recorded loads; each
+  // matched load is removed so that leftovers can be detected afterwards.
+  for (Value *Operand : KeyAMX->operands()) {
+    if (!Operand->getType()->isX86_AMXTy())
+      continue;
+    if (!Loads.erase(Operand))
+      return false;
+  }
+
+  // No load may be left unused, and the store must store KeyAMX's result.
+  // Todo: is it key amx can be no def?
+  return Loads.empty() && StoredTile == static_cast<Value *>(KeyAMX);
+}
+
+// Append the shapes needed by KeyAMX to Shapes, two values per tile: first
+// operands 0 and 1 of the tile load behind each x86_amx operand, in operand
+// order, then (unless KeyAMX is a tile store) operands 0 and 1 of KeyAMX
+// itself. Returns true if Shapes is non-empty afterwards.
+bool X86PreAMXConfig::getKeyAMXShapes(IntrinsicInst *KeyAMX,
+                                      SmallVector<Value *, 8> &Shapes) {
+  for (Value *Operand : KeyAMX->operands()) {
+    if (!Operand->getType()->isX86_AMXTy())
+      continue;
+    IntrinsicInst *TileDef = dyn_cast<IntrinsicInst>(Operand);
+    assert((TileDef && isTileLoad(TileDef)) &&
+           "All KeyAMX's tile definiation should comes from TileLoad!");
+    Shapes.push_back(TileDef->getOperand(0));
+    Shapes.push_back(TileDef->getOperand(1));
+  }
+
+  const bool DefinesTile = !isTileStore(KeyAMX);
+  if (DefinesTile) {
+    Shapes.push_back(KeyAMX->getOperand(0));
+    Shapes.push_back(KeyAMX->getOperand(1));
+  }
+  return !Shapes.empty();
+}
+
+// Collect the shapes and skip the area of current key amx intrinsic.
+//
+// For example:
+// ...
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) record (m,k)
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) record (m,k)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) record (m,k)
+// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(m, n,... td) <--PosEnd record (m,k)
+// --------------------------------------------------------------------------
+BasicBlock::iterator
+X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
+ SmallVector<Value *, 8> &Shapes) {
+ // KeyAMX: the single AMX intrinsic of the area that is neither a tile load
+ // nor a tile store (stays null if there is none).
+ // PosEnd: iterator of the tile store ending the area; BB->end() until found.
+ // Loads: tile loads seen so far, handed to checkVolatileModel().
+ IntrinsicInst *KeyAMX = nullptr;
+ BasicBlock *BB = Iter->getParent();
+ BasicBlock::iterator PosEnd = BB->end();
+ SmallSet<Value *, 4> Loads;
+
+ // See TileStore as "Config Position End" and check volatile model.
+ for (auto I = Iter, E = BB->end(); I != E; ++I) {
+ assert(!brokenVolatile(&*I) && "Not reach tile store!");
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
+ // Instructions that are not AMX intrinsics are skipped.
+ if (!II || !isAMXIntrinsic(II))
+ continue;
+
+ if (isTileLoad(II)) {
+ Loads.insert(II);
+ } else if (isTileStore(II)) {
+ // The first tile store closes the area; a failed model check is fatal.
+ if (!checkVolatileModel(Loads, II, KeyAMX))
+ report_fatal_error("Not Volatile AMX Model!");
+ PosEnd = I;
+ break;
+ } else {
+ assert(!KeyAMX && "Too many key amx intrinsic!");
+ KeyAMX = II;
+ }
+ }
+ // NOTE(review): when asserts are compiled out and no tile store was found,
+ // PosEnd is still BB->end() and is dereferenced below -- confirm that this
+ // cannot happen for input reaching this pass.
+ assert(PosEnd != BB->end() && "Not find TileStore!");
+
+ // See KeyAMX as TileStore if only TileLoad and TileStore.
+ if (!KeyAMX)
+ KeyAMX = dyn_cast<IntrinsicInst>(&*PosEnd);
+
+ // Get Shapes in order.
+ assert(Shapes.empty() && "Shapes should be clean.");
+ getKeyAMXShapes(KeyAMX, Shapes);
+
+ return PosEnd;
+}
+
+// Record a key amx area's shapes with its position.
+// Use the first tileload as its position.
+// For example:
+// ...
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) <-- pos
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) /
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) shapes:
+// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n)
+// call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n)
+// --------------------------------------------------------------------------
+bool X86PreAMXConfig::findConfigShapes(
+    DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes) {
+  bool Found = false;
+  for (BasicBlock &Block : F) {
+    for (BasicBlock::iterator It = Block.begin(), End = Block.end(); It != End;
+         ++It) {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*It);
+      if (!II || !isAMXIntrinsic(II))
+        continue;
+      assert(onlyTileDef(II) && "Not volatile model for AMX at O0!");
+
+      // The first AMX intrinsic of the area is the map key. The helper fills
+      // the shapes and returns the iterator of the closing tile store, so the
+      // loop increment continues right after the area.
+      SmallVector<Value *, 8> &AreaShapes = PosAndShapes[&*It];
+      It = getShapesAndConfigPosEnd(It, AreaShapes);
+      Found = true;
+    }
+  }
+  return Found;
+}
+
+// Insert ldtilecfg and preconfig the shapes for each area of key AMX intrinsic.
+// e.g. (key amx = tdpbssd)
+// --------------------------------------------------------------------------
+// %cfgmem = alloca <16 x i32>, align 4 * allocate mem
+// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init
+// ...
+// ... pre-config shape of %t1 *
+// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
+// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
+// ... *
+// ... pre-config shape of %t2 *
+// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes
+// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
+// ... *
+// ... pre-config shape of %t3 * of
+// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 *
+// ... * tiles
+// ... pre-config shape of %td *
+// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 *
+// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 *
+//
+// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * pre-config
+// --------------------------------------------------------------------------
+// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
+// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
+// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
+// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
+// call void @llvm.x86.tilestored64.internal(... td) area
+// --------------------------------------------------------------------------
+bool X86PreAMXConfig::preTileConfig() {
+  DenseMap<Instruction *, SmallVector<Value *, 8>> PosAndShapes;
+
+  // Nothing to configure if the function has no key AMX area.
+  if (!findConfigShapes(PosAndShapes))
+    return false;
+
+  // NOTE(review): the areas are visited in DenseMap iteration order, and
+  // addTileConfig() inserts one alloca at the front of the entry block per
+  // area -- confirm that the resulting alloca order is deterministic, or
+  // consider an insertion-ordered map.
+  for (auto &Area : PosAndShapes)
+    addTileConfig(Area.first, Area.second);
+
+  return true;
+}
+} // anonymous namespace
+
+namespace {
+
+// Legacy function pass that runs X86PreAMXConfig on each function when the
+// target machine's optimization level is CodeGenOpt::None.
+class X86PreAMXConfigPass : public FunctionPass {
+public:
+  static char ID;
+
+  X86PreAMXConfigPass() : FunctionPass(ID) {
+    initializeX86PreAMXConfigPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  // Returns true if a tile configuration was inserted into F.
+  bool runOnFunction(Function &F) override {
+    TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+    bool C = false;
+
+    // Prepare for fast register allocation at O0.
+    if (TM->getOptLevel() == CodeGenOpt::None) {
+
+      // We pre-config each key AMX intrinsic at O0.
+      // In theory, one tile config can cover several AMX intrinsics, but
+      // it is very difficult to classify the tile shapes at O0. So here we
+      // keep things simple and pre-config every key AMX intrinsic.
+      X86PreAMXConfig PCFG(F);
+      C = PCFG.preTileConfig();
+    }
+
+    return C;
+  }
+
+  // The pass declares that it preserves the CFG and requires
+  // TargetPassConfig (used above to reach the TargetMachine).
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetPassConfig>();
+  }
+};
+
+} // anonymous namespace
+
+// Pass registration: the pass is registered under DEBUG_TYPE with the
+// description in PassName, and declares TargetPassConfig as a dependency.
+static const char PassName[] = "Pre AMX Tile Config";
+char X86PreAMXConfigPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
+
+// Factory for the pass; returns a newly allocated X86PreAMXConfigPass.
+FunctionPass *llvm::createX86PreAMXConfigPass() {
+ return new X86PreAMXConfigPass();
+}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index ff99186609e9..084376dc254a 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -64,6 +64,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeX86LowerAMXIntrinsicsLegacyPassPass(PR);
initializeX86LowerAMXTypeLegacyPassPass(PR);
+ initializeX86PreAMXConfigPassPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@@ -74,6 +75,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86TileConfigPass(PR);
+ initializeX86FastTileConfigPass(PR);
initializeX86LowerTileCopyPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
@@ -377,6 +379,7 @@ class X86PassConfig : public TargetPassConfig {
bool addPreISel() override;
void addMachineSSAOptimization() override;
void addPreRegAlloc() override;
+ bool addPostFastRegAllocRewrite() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
void addPreEmitPass2() override;
@@ -416,6 +419,9 @@ void X86PassConfig::addIRPasses() {
addPass(createX86LowerAMXIntrinsicsPass());
addPass(createX86LowerAMXTypePass());
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ addPass(createX86PreAMXConfigPass());
+
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOpt::None) {
@@ -583,6 +589,11 @@ void X86PassConfig::addPreEmitPass2() {
addPass(createX86LoadValueInjectionRetHardeningPass());
}
+// Hook that adds the X86 fast tile config pass to the pipeline; always
+// returns true.
+bool X86PassConfig::addPostFastRegAllocRewrite() {
+ addPass(createX86FastTileConfigPass());
+ return true;
+}
+
bool X86PassConfig::addPreRewrite() {
addPass(createX86TileConfigPass());
return true;
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
new file mode 100644
index 000000000000..f7089e98fcfe
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
@@ -0,0 +1,4559 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
+
+
+source_filename = "amx_api.c"
+
+%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
+; AVX512-LABEL: test_api:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: .cfi_def_cfa_register %rbp
+; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400
+; AVX512-NEXT: movw %dx, %ax
+; AVX512-NEXT: movw %si, %cx
+; AVX512-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: callq memset at PLT
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: callq memset at PLT
+; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: callq memset at PLT
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: je .LBB0_2
+; AVX512-NEXT: # %bb.1: # %if.then
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %dil
+; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %di
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%rdi)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: jmp .LBB0_3
+; AVX512-NEXT: .LBB0_2: # %if.else
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %dil
+; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %di
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%rdi)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: .LBB0_3: # %if.end
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy at PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: callq memcpy at PLT
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovdqa64 64(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 256(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 320(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 512(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 576(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 640(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 704(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 768(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 832(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 896(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 960(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 1024(%rax), %zmm0
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: movl $1024, %edx # imm = 0x400
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: # kill: def $r8 killed $rax
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; AVX512-NEXT: movw %r10w, %di
+; AVX512-NEXT: shrl $2, %r10d
+; AVX512-NEXT: movw %r10w, %r9w
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: # kill: def $r10b killed $r10b killed $r10d
+; AVX512-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl $64, %r8d
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: tileloadd (%r10,%r8), %tmm0
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: tileloadd (%r10,%r8), %tmm1
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: tileloadd (%r10,%r8), %tmm2
+; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: addq $64, %rdi
+; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: def $rdi killed $rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r9b
+; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%r8)
+; AVX512-NEXT: movl $64, %r8d
+; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: .cfi_def_cfa %rsp, 8
+; AVX512-NEXT: tilerelease
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_api:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX2-NEXT: subq $29696, %rsp # imm = 0x7400
+; AVX2-NEXT: movw %dx, %ax
+; AVX2-NEXT: movw %si, %cx
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: callq memset@PLT
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: callq memset@PLT
+; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: callq memset@PLT
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je .LBB0_2
+; AVX2-NEXT: # %bb.1: # %if.then
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %dil
+; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %di
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%rdi)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: jmp .LBB0_3
+; AVX2-NEXT: .LBB0_2: # %if.else
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %dil
+; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %di
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%rdi)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: .LBB0_3: # %if.end
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: vmovaps 64(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 96(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 128(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 160(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 192(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 224(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 256(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 288(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 320(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 352(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 384(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 416(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 448(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 480(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 512(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 544(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 576(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 608(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 640(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 672(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 704(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 736(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 768(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 800(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 832(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 864(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 896(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 928(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 960(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 992(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 1024(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 1056(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: movl $1024, %edx # imm = 0x400
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: # kill: def $r8 killed $rax
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movw %r10w, %di
+; AVX2-NEXT: shrl $2, %r10d
+; AVX2-NEXT: movw %r10w, %r9w
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: # kill: def $r10b killed $r10b killed $r10d
+; AVX2-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl $64, %r8d
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: tileloadd (%r10,%r8), %tmm0
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: tileloadd (%r10,%r8), %tmm1
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: tileloadd (%r10,%r8), %tmm2
+; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: addq $64, %rdi
+; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: # kill: def $rdi killed $rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r9b
+; AVX2-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%r8)
+; AVX2-NEXT: movl $64, %r8d
+; AVX2-NEXT: tileloadd (%rdi,%r8), %tmm0
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
+; AVX2-NEXT: tilerelease
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; SSE2-LABEL: test_api:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: .cfi_def_cfa_offset 16
+; SSE2-NEXT: .cfi_offset %rbp, -16
+; SSE2-NEXT: movq %rsp, %rbp
+; SSE2-NEXT: .cfi_def_cfa_register %rbp
+; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; SSE2-NEXT: subq $30720, %rsp # imm = 0x7800
+; SSE2-NEXT: movw %dx, %ax
+; SSE2-NEXT: movw %si, %cx
+; SSE2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: callq memset@PLT
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: callq memset@PLT
+; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: callq memset@PLT
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: je .LBB0_2
+; SSE2-NEXT: # %bb.1: # %if.then
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %dil
+; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %di
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%rdi)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: jmp .LBB0_3
+; SSE2-NEXT: .LBB0_2: # %if.else
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %dil
+; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %di
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%rdi)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: .LBB0_3: # %if.end
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movaps 64(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 80(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 96(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 112(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 128(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 144(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 160(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 176(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 192(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 208(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 224(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 240(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 256(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 272(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 288(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 304(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 320(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 336(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 352(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 368(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 384(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 400(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 416(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 432(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 448(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 464(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 480(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 496(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 512(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 528(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 544(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 560(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 576(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 592(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 608(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 624(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 640(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 656(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 672(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 688(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 704(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 720(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 736(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 752(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 768(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 784(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 800(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 816(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 832(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 848(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 864(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 880(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 896(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 912(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 928(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 944(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 960(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 976(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 992(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1008(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1024(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1040(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1056(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1072(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movl $1024, %edx # imm = 0x400
+; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: # kill: def $r8 killed $rax
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: movw %r10w, %di
+; SSE2-NEXT: shrl $2, %r10d
+; SSE2-NEXT: movw %r10w, %r9w
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: # kill: def $r10b killed $r10b killed $r10d
+; SSE2-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movl $64, %r8d
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: tileloadd (%r10,%r8), %tmm0
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: tileloadd (%r10,%r8), %tmm1
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: tileloadd (%r10,%r8), %tmm2
+; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: addq $64, %rdi
+; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: # kill: def $rdi killed $rax
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r9b
+; SSE2-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%r8)
+; SSE2-NEXT: movl $64, %r8d
+; SSE2-NEXT: tileloadd (%rdi,%r8), %tmm0
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: movq %rbp, %rsp
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: .cfi_def_cfa %rsp, 8
+; SSE2-NEXT: tilerelease
+; SSE2-NEXT: retq
+entry:
+ %m.addr.i85 = alloca i16, align 2
+ %n.addr.i86 = alloca i16, align 2
+ %base.addr.i87 = alloca i8*, align 8
+ %stride.addr.i88 = alloca i64, align 8
+ %tile.addr.i = alloca <256 x i32>, align 64
+ %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024
+ %m.addr.i81 = alloca i16, align 2
+ %n.addr.i82 = alloca i16, align 2
+ %k.addr.i = alloca i16, align 2
+ %dst.addr.i83 = alloca <256 x i32>, align 64
+ %src1.addr.i = alloca <256 x i32>, align 64
+ %src2.addr.i = alloca <256 x i32>, align 64
+ %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024
+ %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024
+ %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024
+ %m.addr.i74 = alloca i16, align 2
+ %n.addr.i75 = alloca i16, align 2
+ %base.addr.i76 = alloca i8*, align 8
+ %stride.addr.i77 = alloca i64, align 8
+ %m.addr.i70 = alloca i16, align 2
+ %n.addr.i71 = alloca i16, align 2
+ %base.addr.i72 = alloca i8*, align 8
+ %stride.addr.i73 = alloca i64, align 8
+ %m.addr.i66 = alloca i16, align 2
+ %n.addr.i67 = alloca i16, align 2
+ %base.addr.i68 = alloca i8*, align 8
+ %stride.addr.i69 = alloca i64, align 8
+ %m.addr.i62 = alloca i16, align 2
+ %n.addr.i63 = alloca i16, align 2
+ %base.addr.i64 = alloca i8*, align 8
+ %stride.addr.i65 = alloca i64, align 8
+ %m.addr.i58 = alloca i16, align 2
+ %n.addr.i59 = alloca i16, align 2
+ %base.addr.i60 = alloca i8*, align 8
+ %stride.addr.i61 = alloca i64, align 8
+ %m.addr.i = alloca i16, align 2
+ %n.addr.i = alloca i16, align 2
+ %base.addr.i56 = alloca i8*, align 8
+ %stride.addr.i57 = alloca i64, align 8
+ %base.addr.i50 = alloca i8*, align 8
+ %stride.addr.i51 = alloca i64, align 8
+ %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024
+ %c49 = alloca %struct.__tile1024i_str, align 64
+ %dst.addr.i44 = alloca %struct.__tile1024i_str*, align 8
+ %indirect-arg-temp.i = alloca <256 x i32>, align 1024
+ %indirect-arg-temp4.i = alloca <256 x i32>, align 1024
+ %indirect-arg-temp5.i = alloca <256 x i32>, align 1024
+ %b43 = alloca %struct.__tile1024i_str, align 64
+ %a42 = alloca %struct.__tile1024i_str, align 64
+ %dst.addr.i35 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i36 = alloca i8*, align 8
+ %stride.addr.i37 = alloca i64, align 8
+ %dst.addr.i28 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i29 = alloca i8*, align 8
+ %stride.addr.i30 = alloca i64, align 8
+ %dst.addr.i21 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i22 = alloca i8*, align 8
+ %stride.addr.i23 = alloca i64, align 8
+ %dst.addr.i14 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i15 = alloca i8*, align 8
+ %stride.addr.i16 = alloca i64, align 8
+ %dst.addr.i7 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i8 = alloca i8*, align 8
+ %stride.addr.i9 = alloca i64, align 8
+ %dst.addr.i = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i = alloca i8*, align 8
+ %stride.addr.i = alloca i64, align 8
+ %cond.addr = alloca i32, align 4
+ %row.addr = alloca i16, align 2
+ %col.addr = alloca i16, align 2
+ %a = alloca %struct.__tile1024i_str, align 64
+ %b = alloca %struct.__tile1024i_str, align 64
+ %c = alloca %struct.__tile1024i_str, align 64
+ store i32 %cond, i32* %cond.addr, align 4
+ store i16 %row, i16* %row.addr, align 2
+ store i16 %col, i16* %col.addr, align 2
+ %0 = bitcast %struct.__tile1024i_str* %a to i8*
+ call void @llvm.memset.p0i8.i64(i8* align 64 %0, i8 0, i64 1088, i1 false)
+ %row1 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 0
+ %1 = load i16, i16* %row.addr, align 2
+ store i16 %1, i16* %row1, align 64
+ %col2 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 1
+ store i16 8, i16* %col2, align 2
+ %2 = bitcast %struct.__tile1024i_str* %b to i8*
+ call void @llvm.memset.p0i8.i64(i8* align 64 %2, i8 0, i64 1088, i1 false)
+ %row3 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 0
+ store i16 8, i16* %row3, align 64
+ %col4 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 1
+ %3 = load i16, i16* %col.addr, align 2
+ store i16 %3, i16* %col4, align 2
+ %4 = bitcast %struct.__tile1024i_str* %c to i8*
+ call void @llvm.memset.p0i8.i64(i8* align 64 %4, i8 0, i64 1088, i1 false)
+ %row5 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 0
+ %5 = load i16, i16* %row.addr, align 2
+ store i16 %5, i16* %row5, align 64
+ %col6 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 1
+ %6 = load i16, i16* %col.addr, align 2
+ store i16 %6, i16* %col6, align 2
+ %7 = load i32, i32* %cond.addr, align 4
+ %tobool = icmp ne i32 %7, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i36, align 8
+ store i64 32, i64* %stride.addr.i37, align 8
+ %8 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ %row.i38 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %8, i32 0, i32 0
+ %9 = load i16, i16* %row.i38, align 64
+ %10 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ %col.i39 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %10, i32 0, i32 1
+ %11 = load i16, i16* %col.i39, align 2
+ %12 = load i8*, i8** %base.addr.i36, align 8
+ %13 = load i64, i64* %stride.addr.i37, align 8
+ store i16 %9, i16* %m.addr.i, align 2
+ store i16 %11, i16* %n.addr.i, align 2
+ store i8* %12, i8** %base.addr.i56, align 8
+ store i64 %13, i64* %stride.addr.i57, align 8
+ %14 = load i16, i16* %m.addr.i, align 2
+ %15 = load i16, i16* %n.addr.i, align 2
+ %16 = load i8*, i8** %base.addr.i56, align 8
+ %17 = load i64, i64* %stride.addr.i57, align 8
+ %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %14, i16 %15, i8* %16, i64 %17) #2
+ %19 = bitcast x86_amx %18 to <256 x i32>
+ %20 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %20, i32 0, i32 3
+ store <256 x i32> %19, <256 x i32>* %tile.i41, align 64
+ store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i29, align 8
+ store i64 32, i64* %stride.addr.i30, align 8
+ %21 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ %row.i31 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %21, i32 0, i32 0
+ %22 = load i16, i16* %row.i31, align 64
+ %23 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ %col.i32 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %23, i32 0, i32 1
+ %24 = load i16, i16* %col.i32, align 2
+ %25 = load i8*, i8** %base.addr.i29, align 8
+ %26 = load i64, i64* %stride.addr.i30, align 8
+ store i16 %22, i16* %m.addr.i58, align 2
+ store i16 %24, i16* %n.addr.i59, align 2
+ store i8* %25, i8** %base.addr.i60, align 8
+ store i64 %26, i64* %stride.addr.i61, align 8
+ %27 = load i16, i16* %m.addr.i58, align 2
+ %28 = load i16, i16* %n.addr.i59, align 2
+ %29 = load i8*, i8** %base.addr.i60, align 8
+ %30 = load i64, i64* %stride.addr.i61, align 8
+ %31 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %27, i16 %28, i8* %29, i64 %30) #2
+ %32 = bitcast x86_amx %31 to <256 x i32>
+ %33 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %33, i32 0, i32 3
+ store <256 x i32> %32, <256 x i32>* %tile.i34, align 64
+ store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i22, align 8
+ store i64 32, i64* %stride.addr.i23, align 8
+ %34 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ %row.i24 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %34, i32 0, i32 0
+ %35 = load i16, i16* %row.i24, align 64
+ %36 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ %col.i25 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %36, i32 0, i32 1
+ %37 = load i16, i16* %col.i25, align 2
+ %38 = load i8*, i8** %base.addr.i22, align 8
+ %39 = load i64, i64* %stride.addr.i23, align 8
+ store i16 %35, i16* %m.addr.i62, align 2
+ store i16 %37, i16* %n.addr.i63, align 2
+ store i8* %38, i8** %base.addr.i64, align 8
+ store i64 %39, i64* %stride.addr.i65, align 8
+ %40 = load i16, i16* %m.addr.i62, align 2
+ %41 = load i16, i16* %n.addr.i63, align 2
+ %42 = load i8*, i8** %base.addr.i64, align 8
+ %43 = load i64, i64* %stride.addr.i65, align 8
+ %44 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %40, i16 %41, i8* %42, i64 %43) #2
+ %45 = bitcast x86_amx %44 to <256 x i32>
+ %46 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %46, i32 0, i32 3
+ store <256 x i32> %45, <256 x i32>* %tile.i27, align 64
+ br label %if.end
+
+if.else: ; preds = %entry
+ store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i15, align 8
+ store i64 32, i64* %stride.addr.i16, align 8
+ %47 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ %row.i17 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %47, i32 0, i32 0
+ %48 = load i16, i16* %row.i17, align 64
+ %49 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ %col.i18 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %49, i32 0, i32 1
+ %50 = load i16, i16* %col.i18, align 2
+ %51 = load i8*, i8** %base.addr.i15, align 8
+ %52 = load i64, i64* %stride.addr.i16, align 8
+ store i16 %48, i16* %m.addr.i66, align 2
+ store i16 %50, i16* %n.addr.i67, align 2
+ store i8* %51, i8** %base.addr.i68, align 8
+ store i64 %52, i64* %stride.addr.i69, align 8
+ %53 = load i16, i16* %m.addr.i66, align 2
+ %54 = load i16, i16* %n.addr.i67, align 2
+ %55 = load i8*, i8** %base.addr.i68, align 8
+ %56 = load i64, i64* %stride.addr.i69, align 8
+ %57 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %53, i16 %54, i8* %55, i64 %56) #2
+ %58 = bitcast x86_amx %57 to <256 x i32>
+ %59 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %59, i32 0, i32 3
+ store <256 x i32> %58, <256 x i32>* %tile.i20, align 64
+ store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i8, align 8
+ store i64 32, i64* %stride.addr.i9, align 8
+ %60 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ %row.i10 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %60, i32 0, i32 0
+ %61 = load i16, i16* %row.i10, align 64
+ %62 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ %col.i11 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %62, i32 0, i32 1
+ %63 = load i16, i16* %col.i11, align 2
+ %64 = load i8*, i8** %base.addr.i8, align 8
+ %65 = load i64, i64* %stride.addr.i9, align 8
+ store i16 %61, i16* %m.addr.i70, align 2
+ store i16 %63, i16* %n.addr.i71, align 2
+ store i8* %64, i8** %base.addr.i72, align 8
+ store i64 %65, i64* %stride.addr.i73, align 8
+ %66 = load i16, i16* %m.addr.i70, align 2
+ %67 = load i16, i16* %n.addr.i71, align 2
+ %68 = load i8*, i8** %base.addr.i72, align 8
+ %69 = load i64, i64* %stride.addr.i73, align 8
+ %70 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %66, i16 %67, i8* %68, i64 %69) #2
+ %71 = bitcast x86_amx %70 to <256 x i32>
+ %72 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %72, i32 0, i32 3
+ store <256 x i32> %71, <256 x i32>* %tile.i13, align 64
+ store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i, align 8
+ store i64 32, i64* %stride.addr.i, align 8
+ %73 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
+ %row.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %73, i32 0, i32 0
+ %74 = load i16, i16* %row.i, align 64
+ %75 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
+ %col.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %75, i32 0, i32 1
+ %76 = load i16, i16* %col.i, align 2
+ %77 = load i8*, i8** %base.addr.i, align 8
+ %78 = load i64, i64* %stride.addr.i, align 8
+ store i16 %74, i16* %m.addr.i74, align 2
+ store i16 %76, i16* %n.addr.i75, align 2
+ store i8* %77, i8** %base.addr.i76, align 8
+ store i64 %78, i64* %stride.addr.i77, align 8
+ %79 = load i16, i16* %m.addr.i74, align 2
+ %80 = load i16, i16* %n.addr.i75, align 2
+ %81 = load i8*, i8** %base.addr.i76, align 8
+ %82 = load i64, i64* %stride.addr.i77, align 8
+ %83 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %79, i16 %80, i8* %81, i64 %82) #2
+ %84 = bitcast x86_amx %83 to <256 x i32>
+ %85 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
+ %tile.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %85, i32 0, i32 3
+ store <256 x i32> %84, <256 x i32>* %tile.i, align 64
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %86 = bitcast %struct.__tile1024i_str* %b43 to i8*
+ %87 = bitcast %struct.__tile1024i_str* %b to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %86, i8* align 1 %87, i64 1088, i1 false) #2
+ %88 = bitcast %struct.__tile1024i_str* %a42 to i8*
+ %89 = bitcast %struct.__tile1024i_str* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %88, i8* align 1 %89, i64 1088, i1 false) #2
+ store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i44, align 8
+ %row.i45 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 0
+ %90 = load i16, i16* %row.i45, align 64
+ %col.i46 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 1
+ %91 = load i16, i16* %col.i46, align 2
+ %col1.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 1
+ %92 = load i16, i16* %col1.i, align 2
+ %93 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8
+ %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %93, i32 0, i32 3
+ %94 = load <256 x i32>, <256 x i32>* %tile.i47, align 64
+ %tile2.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 3
+ %95 = load <256 x i32>, <256 x i32>* %tile2.i, align 64
+ %tile3.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 3
+ %96 = load <256 x i32>, <256 x i32>* %tile3.i, align 64
+ store <256 x i32> %94, <256 x i32>* %indirect-arg-temp.i, align 1024
+ store <256 x i32> %95, <256 x i32>* %indirect-arg-temp4.i, align 1024
+ store <256 x i32> %96, <256 x i32>* %indirect-arg-temp5.i, align 1024
+ %97 = bitcast <256 x i32>* %indirect-arg-temp5.i80 to i8*
+ %98 = bitcast <256 x i32>* %indirect-arg-temp5.i to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %97, i8* align 1 %98, i64 1024, i1 false) #2
+ %99 = bitcast <256 x i32>* %indirect-arg-temp4.i79 to i8*
+ %100 = bitcast <256 x i32>* %indirect-arg-temp4.i to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %99, i8* align 1 %100, i64 1024, i1 false) #2
+ %101 = bitcast <256 x i32>* %indirect-arg-temp.i78 to i8*
+ %102 = bitcast <256 x i32>* %indirect-arg-temp.i to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %101, i8* align 1 %102, i64 1024, i1 false) #2
+ %dst.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i78, align 1024
+ %src1.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp4.i79, align 1024
+ %src2.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp5.i80, align 1024
+ store i16 %90, i16* %m.addr.i81, align 2
+ store i16 %91, i16* %n.addr.i82, align 2
+ store i16 %92, i16* %k.addr.i, align 2
+ store <256 x i32> %dst.i, <256 x i32>* %dst.addr.i83, align 64
+ store <256 x i32> %src1.i, <256 x i32>* %src1.addr.i, align 64
+ store <256 x i32> %src2.i, <256 x i32>* %src2.addr.i, align 64
+ %103 = load i16, i16* %m.addr.i81, align 2
+ %104 = load i16, i16* %n.addr.i82, align 2
+ %105 = load i16, i16* %k.addr.i, align 2
+ %106 = load <256 x i32>, <256 x i32>* %dst.addr.i83, align 64
+ %107 = bitcast <256 x i32> %106 to x86_amx
+ %108 = load <256 x i32>, <256 x i32>* %src1.addr.i, align 64
+ %109 = bitcast <256 x i32> %108 to x86_amx
+ %110 = load <256 x i32>, <256 x i32>* %src2.addr.i, align 64
+ %111 = bitcast <256 x i32> %110 to x86_amx
+ %112 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %103, i16 %104, i16 %105, x86_amx %107, x86_amx %109, x86_amx %111) #2
+ %113 = bitcast x86_amx %112 to <256 x i32>
+ %114 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8
+ %tile6.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %114, i32 0, i32 3
+ store <256 x i32> %113, <256 x i32>* %tile6.i, align 64
+ %115 = bitcast %struct.__tile1024i_str* %c49 to i8*
+ %116 = bitcast %struct.__tile1024i_str* %c to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %115, i8* align 1 %116, i64 1088, i1 false) #2
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i50, align 8
+ store i64 32, i64* %stride.addr.i51, align 8
+ %row.i53 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 0
+ %117 = load i16, i16* %row.i53, align 64
+ %col.i54 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 1
+ %118 = load i16, i16* %col.i54, align 2
+ %119 = load i8*, i8** %base.addr.i50, align 8
+ %120 = load i64, i64* %stride.addr.i51, align 8
+ %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 3
+ %121 = load <256 x i32>, <256 x i32>* %tile.i55, align 64
+ store <256 x i32> %121, <256 x i32>* %indirect-arg-temp.i52, align 1024
+ %122 = bitcast <256 x i32>* %indirect-arg-temp.i5284 to i8*
+ %123 = bitcast <256 x i32>* %indirect-arg-temp.i52 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %122, i8* align 1 %123, i64 1024, i1 false) #2
+ %tile.i89 = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i5284, align 1024
+ store i16 %117, i16* %m.addr.i85, align 2
+ store i16 %118, i16* %n.addr.i86, align 2
+ store i8* %119, i8** %base.addr.i87, align 8
+ store i64 %120, i64* %stride.addr.i88, align 8
+ store <256 x i32> %tile.i89, <256 x i32>* %tile.addr.i, align 64
+ %124 = load i16, i16* %m.addr.i85, align 2
+ %125 = load i16, i16* %n.addr.i86, align 2
+ %126 = load i8*, i8** %base.addr.i87, align 8
+ %127 = load i64, i64* %stride.addr.i88, align 8
+ %128 = load <256 x i32>, <256 x i32>* %tile.addr.i, align 64
+ %129 = bitcast <256 x i32> %128 to x86_amx
+ call void @llvm.x86.tilestored64.internal(i16 %124, i16 %125, i8* %126, i64 %127, x86_amx %129) #2
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #2
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
+
+; Function Attrs: nounwind
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #2
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3
+
+attributes #0 = { noinline nounwind optnone uwtable }
+attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
+attributes #2 = { nounwind }
+attributes #3 = { argmemonly nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
new file mode 100644
index 000000000000..9673b0469ffc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -lower-amx-type -S | FileCheck %s
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
+
+; CHECK-LABEL: entry:
+; CHECK: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
+; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
+; CHECK: if.then:
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: br label %if.end
+; CHECK: if.else:
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: br label %if.end
+; CHECK: if.end:
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: ret void
+
+entry:
+ %tobool.not = icmp eq i32 %cond, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ br label %if.end
+
+if.else: ; preds = %entry
+ %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
+ %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
+ %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
+ %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+
+; Function Attrs: nounwind
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
new file mode 100644
index 000000000000..8bd5f94c36a0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -pre-amx-config -S | FileCheck %s
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
+; CHECK-LABEL: entry:
+; CHECK: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
+; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
+; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
+; CHECK: if.then:
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: br label %if.end
+; CHECK: if.else:
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: br label %if.end
+; CHECK: if.end: ; preds = %if.else, %if.then
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: %amx.tmm.1.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 49
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 18
+; CHECK-NEXT: %amx.tmm.1.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.1.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 8, i16* %amx.tmm.1.shape.col{{.*}}, align 2
+; CHECK-NEXT: %amx.tmm.2.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 50
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 20
+; CHECK-NEXT: %amx.tmm.2.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.2.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.2.shape.col{{.*}}, align 2
+; CHECK-NEXT: %amx.tmm.3.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 51
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 22
+; CHECK-NEXT: %amx.tmm.3.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.3.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.3.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
+; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
+; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
+; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
+; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: ret void
+entry:
+ %0 = alloca <256 x i32>, align 1024
+ %1 = bitcast <256 x i32>* %0 to i8*
+ %2 = alloca <256 x i32>, align 1024
+ %3 = bitcast <256 x i32>* %2 to i8*
+ %4 = alloca <256 x i32>, align 1024
+ %5 = bitcast <256 x i32>* %4 to i8*
+ %6 = alloca <256 x i32>, align 1024
+ %7 = bitcast <256 x i32>* %6 to i8*
+ %tobool.not = icmp eq i32 %cond, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %8)
+ %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %9)
+ %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %10)
+ br label %if.end
+
+if.else: ; preds = %entry
+ %11 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %11)
+ %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %12)
+ %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %13)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %14 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %5, i64 64)
+ %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %3, i64 64)
+ %16 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %1, i64 64)
+ %17 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %16, x86_amx %14, x86_amx %15)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %7, i64 64, x86_amx %17)
+ %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %7, i64 64)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %18)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+
+; Function Attrs: nounwind
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
new file mode 100644
index 000000000000..0771d93e1a68
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
@@ -0,0 +1,513 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
+; AVX512-LABEL: test_api:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: .cfi_def_cfa_register %rbp
+; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX512-NEXT: subq $6144, %rsp # imm = 0x1800
+; AVX512-NEXT: movw %dx, %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movw %si, %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: cmpl $0, %edi
+; AVX512-NEXT: je .LBB0_2
+; AVX512-NEXT: # %bb.1: # %if.then
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %sil
+; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl $buf, %r9d
+; AVX512-NEXT: movl $32, %r10d
+; AVX512-NEXT: movw $8, %si
+; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX512-NEXT: movl $64, %r8d
+; AVX512-NEXT: tilestored %tmm0, (%r11,%r8)
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %dil
+; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%rsi)
+; AVX512-NEXT: movl $buf, %esi
+; AVX512-NEXT: movl $32, %edi
+; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: jmp .LBB0_3
+; AVX512-NEXT: .LBB0_2: # %if.else
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %sil
+; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl $buf2, %r9d
+; AVX512-NEXT: movl $32, %r10d
+; AVX512-NEXT: movw $8, %si
+; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX512-NEXT: movl $64, %r8d
+; AVX512-NEXT: tilestored %tmm0, (%r11,%r8)
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %dil
+; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%rsi)
+; AVX512-NEXT: movl $buf2, %esi
+; AVX512-NEXT: movl $32, %edi
+; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: .LBB0_3: # %if.end
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %sil
+; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: movw $8, %di
+; AVX512-NEXT: tileloadd (%r10,%rsi), %tmm1
+; AVX512-NEXT: tileloadd (%r9,%rsi), %tmm2
+; AVX512-NEXT: tileloadd (%r8,%rsi), %tmm0
+; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %dil
+; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%rsi)
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movl $buf, %edx
+; AVX512-NEXT: movl $32, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: .cfi_def_cfa %rsp, 8
+; AVX512-NEXT: tilerelease
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_api:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX2-NEXT: subq $6144, %rsp # imm = 0x1800
+; AVX2-NEXT: movw %dx, %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movw %si, %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: cmpl $0, %edi
+; AVX2-NEXT: je .LBB0_2
+; AVX2-NEXT: # %bb.1: # %if.then
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %sil
+; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl $buf, %r9d
+; AVX2-NEXT: movl $32, %r10d
+; AVX2-NEXT: movw $8, %si
+; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX2-NEXT: movl $64, %r8d
+; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %dil
+; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%rsi)
+; AVX2-NEXT: movl $buf, %esi
+; AVX2-NEXT: movl $32, %edi
+; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: jmp .LBB0_3
+; AVX2-NEXT: .LBB0_2: # %if.else
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %sil
+; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl $buf2, %r9d
+; AVX2-NEXT: movl $32, %r10d
+; AVX2-NEXT: movw $8, %si
+; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX2-NEXT: movl $64, %r8d
+; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
+; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %dil
+; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%rsi)
+; AVX2-NEXT: movl $buf2, %esi
+; AVX2-NEXT: movl $32, %edi
+; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: .LBB0_3: # %if.end
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %sil
+; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: movw $8, %di
+; AVX2-NEXT: tileloadd (%r10,%rsi), %tmm1
+; AVX2-NEXT: tileloadd (%r9,%rsi), %tmm2
+; AVX2-NEXT: tileloadd (%r8,%rsi), %tmm0
+; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %dil
+; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%rsi)
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movl $buf, %edx
+; AVX2-NEXT: movl $32, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
+; AVX2-NEXT: tilerelease
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; SSE2-LABEL: test_api:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: .cfi_def_cfa_offset 16
+; SSE2-NEXT: .cfi_offset %rbp, -16
+; SSE2-NEXT: movq %rsp, %rbp
+; SSE2-NEXT: .cfi_def_cfa_register %rbp
+; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; SSE2-NEXT: subq $6144, %rsp # imm = 0x1800
+; SSE2-NEXT: movw %dx, %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movw %si, %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: cmpl $0, %edi
+; SSE2-NEXT: je .LBB0_2
+; SSE2-NEXT: # %bb.1: # %if.then
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %sil
+; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movl $buf, %r9d
+; SSE2-NEXT: movl $32, %r10d
+; SSE2-NEXT: movw $8, %si
+; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
+; SSE2-NEXT: movl $64, %r8d
+; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
+; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %dil
+; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%rsi)
+; SSE2-NEXT: movl $buf, %esi
+; SSE2-NEXT: movl $32, %edi
+; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: jmp .LBB0_3
+; SSE2-NEXT: .LBB0_2: # %if.else
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %sil
+; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movl $buf2, %r9d
+; SSE2-NEXT: movl $32, %r10d
+; SSE2-NEXT: movw $8, %si
+; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
+; SSE2-NEXT: movl $64, %r8d
+; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
+; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %dil
+; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%rsi)
+; SSE2-NEXT: movl $buf2, %esi
+; SSE2-NEXT: movl $32, %edi
+; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: .LBB0_3: # %if.end
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %sil
+; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: movw $8, %di
+; SSE2-NEXT: tileloadd (%r10,%rsi), %tmm1
+; SSE2-NEXT: tileloadd (%r9,%rsi), %tmm2
+; SSE2-NEXT: tileloadd (%r8,%rsi), %tmm0
+; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %dil
+; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%rsi)
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movl $buf, %edx
+; SSE2-NEXT: movl $32, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: movq %rbp, %rsp
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: .cfi_def_cfa %rsp, 8
+; SSE2-NEXT: tilerelease
+; SSE2-NEXT: retq
+entry:
+ %tobool.not = icmp eq i32 %cond, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ br label %if.end
+
+if.else: ; preds = %entry
+ %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
+ %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
+ %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
+ %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+
+; Function Attrs: nounwind
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+
+; Function Attrs: nounwind
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir b/llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir
new file mode 100644
index 000000000000..ebb0c7d501fc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir
@@ -0,0 +1,465 @@
+# RUN: llc -o - -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -run-pass=fasttileconfig %s | FileCheck %s
+
+--- |
+
+ @buf = dso_local global [1024 x i8] zeroinitializer, align 16
+ @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr #0 {
+ entry:
+ %0 = alloca <16 x i32>, align 4
+ %1 = alloca <16 x i32>, align 4
+ %2 = alloca <16 x i32>, align 4
+ %3 = alloca <16 x i32>, align 4
+ %4 = alloca <16 x i32>, align 4
+ %5 = alloca <16 x i32>, align 4
+ %6 = alloca <16 x i32>, align 4
+ %7 = alloca <16 x i32>, align 4
+ %8 = alloca <256 x i32>, align 1024
+ %9 = bitcast <256 x i32>* %8 to i8*
+ %10 = alloca <256 x i32>, align 1024
+ %11 = bitcast <256 x i32>* %10 to i8*
+ %12 = alloca <256 x i32>, align 1024
+ %13 = bitcast <256 x i32>* %12 to i8*
+ %14 = alloca <256 x i32>, align 1024
+ %15 = bitcast <256 x i32>* %14 to i8*
+ %tobool.not = icmp eq i32 %cond, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+ if.then: ; preds = %entry
+ %16 = bitcast <16 x i32>* %6 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %6, align 64
+ %amx.tmm.0.shape.row1 = getelementptr i8, i8* %16, i64 48
+ %17 = getelementptr i8, i8* %16, i64 16
+ %amx.tmm.0.shape.col2 = bitcast i8* %17 to i16*
+ %18 = trunc i16 %row to i8
+ store volatile i8 %18, i8* %amx.tmm.0.shape.row1, align 1
+ store volatile i16 8, i16* %amx.tmm.0.shape.col2, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %16)
+ %19 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %19)
+ %20 = bitcast <16 x i32>* %2 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %2, align 64
+ %amx.tmm.0.shape.row9 = getelementptr i8, i8* %20, i64 48
+ %21 = getelementptr i8, i8* %20, i64 16
+ %amx.tmm.0.shape.col10 = bitcast i8* %21 to i16*
+ %22 = trunc i16 8 to i8
+ store volatile i8 %22, i8* %amx.tmm.0.shape.row9, align 1
+ store volatile i16 %col, i16* %amx.tmm.0.shape.col10, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %20)
+ %23 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %23)
+ %24 = bitcast <16 x i32>* %3 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %3, align 64
+ %amx.tmm.0.shape.row7 = getelementptr i8, i8* %24, i64 48
+ %25 = getelementptr i8, i8* %24, i64 16
+ %amx.tmm.0.shape.col8 = bitcast i8* %25 to i16*
+ %26 = trunc i16 %row to i8
+ store volatile i8 %26, i8* %amx.tmm.0.shape.row7, align 1
+ store volatile i16 %col, i16* %amx.tmm.0.shape.col8, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %24)
+ %27 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %27)
+ br label %if.end
+
+ if.else: ; preds = %entry
+ %28 = bitcast <16 x i32>* %1 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %1, align 64
+ %amx.tmm.0.shape.row11 = getelementptr i8, i8* %28, i64 48
+ %29 = getelementptr i8, i8* %28, i64 16
+ %amx.tmm.0.shape.col12 = bitcast i8* %29 to i16*
+ %30 = trunc i16 %row to i8
+ store volatile i8 %30, i8* %amx.tmm.0.shape.row11, align 1
+ store volatile i16 8, i16* %amx.tmm.0.shape.col12, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %28)
+ %31 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %31)
+ %32 = bitcast <16 x i32>* %7 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %7, align 64
+ %amx.tmm.0.shape.row = getelementptr i8, i8* %32, i64 48
+ %33 = getelementptr i8, i8* %32, i64 16
+ %amx.tmm.0.shape.col = bitcast i8* %33 to i16*
+ %34 = trunc i16 8 to i8
+ store volatile i8 %34, i8* %amx.tmm.0.shape.row, align 1
+ store volatile i16 %col, i16* %amx.tmm.0.shape.col, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %32)
+ %35 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %35)
+ %36 = bitcast <16 x i32>* %0 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %0, align 64
+ %amx.tmm.0.shape.row13 = getelementptr i8, i8* %36, i64 48
+ %37 = getelementptr i8, i8* %36, i64 16
+ %amx.tmm.0.shape.col14 = bitcast i8* %37 to i16*
+ %38 = trunc i16 %row to i8
+ store volatile i8 %38, i8* %amx.tmm.0.shape.row13, align 1
+ store volatile i16 %col, i16* %amx.tmm.0.shape.col14, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %36)
+ %39 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %39)
+ br label %if.end
+
+ if.end: ; preds = %if.else, %if.then
+ %40 = bitcast <16 x i32>* %4 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %4, align 64
+ %amx.tmm.0.shape.row5 = getelementptr i8, i8* %40, i64 48
+ %41 = getelementptr i8, i8* %40, i64 16
+ %amx.tmm.0.shape.col6 = bitcast i8* %41 to i16*
+ %42 = trunc i16 %row to i8
+ store volatile i8 %42, i8* %amx.tmm.0.shape.row5, align 1
+ store volatile i16 %col, i16* %amx.tmm.0.shape.col6, align 2
+ %amx.tmm.1.shape.row = getelementptr i8, i8* %40, i64 49
+ %43 = getelementptr i8, i8* %40, i64 18
+ %amx.tmm.1.shape.col = bitcast i8* %43 to i16*
+ %44 = trunc i16 %row to i8
+ store volatile i8 %44, i8* %amx.tmm.1.shape.row, align 1
+ store volatile i16 8, i16* %amx.tmm.1.shape.col, align 2
+ %amx.tmm.2.shape.row = getelementptr i8, i8* %40, i64 50
+ %45 = getelementptr i8, i8* %40, i64 20
+ %amx.tmm.2.shape.col = bitcast i8* %45 to i16*
+ %46 = trunc i16 8 to i8
+ store volatile i8 %46, i8* %amx.tmm.2.shape.row, align 1
+ store volatile i16 %col, i16* %amx.tmm.2.shape.col, align 2
+ %amx.tmm.3.shape.row = getelementptr i8, i8* %40, i64 51
+ %47 = getelementptr i8, i8* %40, i64 22
+ %amx.tmm.3.shape.col = bitcast i8* %47 to i16*
+ %48 = trunc i16 %row to i8
+ store volatile i8 %48, i8* %amx.tmm.3.shape.row, align 1
+ store volatile i16 %col, i16* %amx.tmm.3.shape.col, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %40)
+ %49 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %13, i64 64)
+ %50 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %11, i64 64)
+ %51 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %9, i64 64)
+ %52 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %51, x86_amx %49, x86_amx %50)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %15, i64 64, x86_amx %52)
+ %53 = bitcast <16 x i32>* %5 to i8*
+ store <16 x i32> zeroinitializer, <16 x i32>* %5, align 64
+ %amx.tmm.0.shape.row3 = getelementptr i8, i8* %53, i64 48
+ %54 = getelementptr i8, i8* %53, i64 16
+ %amx.tmm.0.shape.col4 = bitcast i8* %54 to i16*
+ %55 = trunc i16 %row to i8
+ store volatile i8 %55, i8* %amx.tmm.0.shape.row3, align 1
+ store volatile i16 %col, i16* %amx.tmm.0.shape.col4, align 2
+ call void @llvm.x86.ldtilecfg.internal(i8* %53)
+ %56 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %15, i64 64)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %56)
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #1
+
+ ; Function Attrs: nounwind
+ declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #1
+
+ ; Function Attrs: nounwind
+ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #1
+
+ ; Function Attrs: nounwind
+ declare void @llvm.x86.ldtilecfg.internal(i8*) #2
+
+ attributes #0 = { "target-features"="+amx-int8,+avx512f" }
+ attributes #1 = { nounwind "target-features"="+amx-int8,+avx512f" }
+ attributes #2 = { nounwind }
+
+...
+---
+name: test_api
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$edi', virtual-reg: '' }
+ - { reg: '$esi', virtual-reg: '' }
+ - { reg: '$edx', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1024
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 2, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 3, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 5, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 6, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 7, name: '', type: default, offset: 0, size: 64, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 8, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 9, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 10, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 11, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 12, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 13, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 14, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 15, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 16, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 17, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ liveins: $edi, $esi, $edx
+
+ renamable $ax = COPY renamable $dx, implicit killed $edx
+ MOV16mr %stack.17, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.17)
+ renamable $ax = COPY renamable $si, implicit killed $esi
+ MOV16mr %stack.16, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.16)
+ renamable $rax = LEA64r %stack.8, 1, $noreg, 0, $noreg
+ MOV64mr %stack.15, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.15)
+ renamable $rax = LEA64r %stack.9, 1, $noreg, 0, $noreg
+ MOV64mr %stack.14, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.14)
+ renamable $rax = LEA64r %stack.10, 1, $noreg, 0, $noreg
+ MOV64mr %stack.13, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.13)
+ renamable $rax = LEA64r %stack.11, 1, $noreg, 0, $noreg
+ MOV64mr %stack.12, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.12)
+ CMP32ri8 killed renamable $edi, 0, implicit-def $eflags
+ JCC_1 %bb.2, 4, implicit killed $eflags
+
+ bb.1.if.then:
+ successors: %bb.3(0x80000000)
+ ; CHECK-LABEL: bb.1.if.then
+ ; tmm0 --> row_offset = 48, col_offset = 16
+ ; CHECK: MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1)
+ ; CHECK: MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2)
+ ; CHECK: PLDTILECFGV %stack.6, 1, $noreg, 0, $noreg
+ ; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
+ ; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
+
+ ; tmm1 --> row_offset = 49, col_offset = 18
+ ; CHECK: MOV8mi %stack.2, 1, $noreg, 49, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9)
+ ; CHECK: MOV16mr %stack.2, 1, $noreg, 18, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10)
+ ; CHECK: PLDTILECFGV %stack.2, 1, $noreg, 0, $noreg
+ ; CHECK: renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
+ ; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1
+
+ ; tmm2 --> row_offset = 50, col_offset = 20
+ ; CHECK: MOV8mr %stack.3, 1, $noreg, 50, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7)
+ ; CHECK: MOV16mr %stack.3, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8)
+ ; CHECK: PLDTILECFGV killed renamable $rsi, 1, $noreg, 0, $noreg
+ ; CHECK: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
+ ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2
+
+ $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
+ $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
+ $rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
+ $rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
+ $r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
+ renamable $zmm0 = AVX512_512_SET0
+ VMOVDQA64Zmr %stack.6, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.6)
+ renamable $sil = COPY renamable $al
+ MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1)
+ MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2)
+ PLDTILECFGV %stack.6, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $r9 = MOV32ri64 @buf
+ renamable $r10 = MOV32ri64 32
+ renamable $si = MOV16ri 8
+ renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
+ renamable $r8 = MOV32ri64 64
+ PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
+ VMOVDQA64Zmr %stack.2, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.2)
+ MOV8mi %stack.2, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9)
+ MOV16mr %stack.2, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10)
+ PLDTILECFGV %stack.2, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
+ PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1
+ renamable $rsi = LEA64r %stack.3, 1, $noreg, 0, $noreg
+ VMOVDQA64Zmr %stack.3, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.3)
+ renamable $dil = COPY renamable $al
+ MOV8mr %stack.3, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7)
+ MOV16mr %stack.3, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8)
+ PLDTILECFGV killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $rsi = MOV32ri64 @buf
+ renamable $rdi = MOV32ri64 32
+ renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
+ renamable $rsi = MOV32ri64 64
+ PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2
+ JMP_1 %bb.3
+
+ bb.2.if.else:
+ successors: %bb.3(0x80000000)
+
+ ; CHECK-LABEL: bb.2.if.else
+ ; tmm3 --> row_offset = 51, col_offset = 22
+ ; CHECK: MOV8mr %stack.1, 1, $noreg, 51, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11)
+ ; CHECK: MOV16mi %stack.1, 1, $noreg, 22, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12)
+ ; CHECK: PLDTILECFGV %stack.1, 1, $noreg, 0, $noreg
+ ; CHECK: renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
+ ; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3
+
+ ; tmm4 --> row_offset = 52, col_offset = 24
+ ; CHECK: MOV8mi %stack.7, 1, $noreg, 52, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row)
+ ; CHECK: MOV16mr %stack.7, 1, $noreg, 24, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col)
+ ; CHECK: PLDTILECFGV %stack.7, 1, $noreg, 0, $noreg
+ ; CHECK: renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
+ ; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4
+
+ ; tmm5 --> row_offset = 53, col_offset = 26
+ ; CHECK: MOV8mr %stack.0, 1, $noreg, 53, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13)
+ ; CHECK: MOV16mr %stack.0, 1, $noreg, 26, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14)
+ ; CHECK: PLDTILECFGV killed renamable $rsi, 1, $noreg, 0, $noreg
+ ; CHECK: renamable $tmm5 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
+ ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5
+
+ $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
+ $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
+ $rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
+ $rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
+ $r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
+ renamable $zmm0 = AVX512_512_SET0
+ VMOVDQA64Zmr %stack.1, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.1)
+ renamable $sil = COPY renamable $al
+ MOV8mr %stack.1, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11)
+ MOV16mi %stack.1, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12)
+ PLDTILECFGV %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $r9 = MOV32ri64 @buf2
+ renamable $r10 = MOV32ri64 32
+ renamable $si = MOV16ri 8
+ renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
+ renamable $r8 = MOV32ri64 64
+ PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3
+ VMOVDQA64Zmr %stack.7, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.7)
+ MOV8mi %stack.7, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row)
+ MOV16mr %stack.7, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col)
+ PLDTILECFGV %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
+ PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4
+ renamable $rsi = LEA64r %stack.0, 1, $noreg, 0, $noreg
+ VMOVDQA64Zmr %stack.0, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.0)
+ renamable $dil = COPY renamable $al
+ MOV8mr %stack.0, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13)
+ MOV16mr %stack.0, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14)
+ PLDTILECFGV killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $rsi = MOV32ri64 @buf2
+ renamable $rdi = MOV32ri64 32
+ renamable $tmm5 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
+ renamable $rsi = MOV32ri64 64
+ PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5
+
+ bb.3.if.end:
+ ; CHECK-LABEL: bb.3.if.end
+ ; tmm0 --> row_offset = 48, col_offset = 16
+ ; tmm1 --> row_offset = 49, col_offset = 18
+ ; tmm2 --> row_offset = 50, col_offset = 20
+ ; CHECK: MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5)
+ ; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6)
+ ; CHECK: MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row)
+ ; CHECK: MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col)
+ ; CHECK: MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row)
+ ; CHECK: MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col)
+ ; CHECK: MOV8mr %stack.4, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row)
+ ; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col)
+ ; CHECK: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0
+ ; CHECK: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg
+ ; CHECK: renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
+ ; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg
+ ; CHECK: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
+ ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+
+ ; tmm6 --> row_offset = 54, col_offset = 28
+ ; CHECK: MOV8mr %stack.5, 1, $noreg, 54, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3)
+ ; CHECK: MOV16mr %stack.5, 1, $noreg, 28, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4)
+ ; CHECK: PLDTILECFGV killed renamable $rsi, 1, $noreg, 0, $noreg
+ ; CHECK: renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+ ; CHECK: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6
+
+ $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
+ $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
+ $rdx = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load 8 from %stack.12)
+ $r8 = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
+ $r9 = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
+ $r10 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
+ renamable $zmm0 = AVX512_512_SET0
+ VMOVDQA64Zmr %stack.4, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.4)
+ renamable $sil = COPY renamable $al
+ MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5)
+ MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6)
+ MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row)
+ MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col)
+ MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row)
+ MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col)
+ MOV8mr %stack.4, 1, $noreg, 51, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row)
+ MOV16mr %stack.4, 1, $noreg, 22, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col)
+ PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $rsi = MOV32ri64 64
+ renamable $di = MOV16ri 8
+ renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg
+ renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
+ renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg
+ renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
+ PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+ renamable $rsi = LEA64r %stack.5, 1, $noreg, 0, $noreg
+ VMOVDQA64Zmr %stack.5, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.5)
+ renamable $dil = COPY renamable $al
+ MOV8mr %stack.5, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3)
+ MOV16mr %stack.5, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4)
+ PLDTILECFGV killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
+ renamable $rsi = MOV32ri64 64
+ renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+ renamable $rdx = MOV32ri64 @buf
+ renamable $rsi = MOV32ri64 32
+ PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6
+ RETQ
+
+...
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
index 1145ff75ba6a..2046d84b2d96 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_no_bitcast(i32* %A_mem, i32* %B_mem, i32* %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
index 9b05356bab23..708c7abe8565 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
; CHECK-LABEL: @test_amx_load_non_O0(
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index d68133277cce..559739a5e00e 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -20,6 +20,7 @@
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
+; CHECK-NEXT: Pre AMX Tile Config
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
@@ -46,6 +47,7 @@
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
; CHECK-NEXT: Fast Register Allocator
+; CHECK-NEXT: Fast Tile Register Configure
; CHECK-NEXT: X86 Lower Tile Copy
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 254fd6abe66d..ee0b05d77020 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -522,8 +522,8 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
"expand-reductions", "indirectbr-expand",
"generic-to-nvvm", "expandmemcmp",
"loop-reduce", "lower-amx-type",
- "lower-amx-intrinsics", "polyhedral-info",
- "replace-with-veclib"};
+ "pre-amx-config", "lower-amx-intrinsics",
+ "polyhedral-info", "replace-with-veclib"};
for (const auto &P : PassNamePrefix)
if (Pass.startswith(P))
return true;
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
index 0c848f354f9d..be4b8ce8360a 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
@@ -87,6 +87,7 @@ static_library("LLVMX86CodeGen") {
"X86EvexToVex.cpp",
"X86ExpandPseudo.cpp",
"X86FastISel.cpp",
+ "X86FastTileConfig.cpp",
"X86FixupBWInsts.cpp",
"X86FixupLEAs.cpp",
"X86FixupSetCC.cpp",
@@ -110,6 +111,7 @@ static_library("LLVMX86CodeGen") {
"X86LoadValueInjectionRetHardening.cpp",
"X86LowerAMXIntrinsics.cpp",
"X86LowerAMXType.cpp",
+ "X86PreAMXConfig.cpp",
"X86LowerTileCopy.cpp",
"X86MCInstLower.cpp",
"X86MachineFunctionInfo.cpp",
More information about the cfe-commits
mailing list