[llvm] cc88724 - [CodeGen] Ensure callers of CreateStackTemporary use sensible alignments
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 9 00:10:44 PDT 2020
Author: David Sherwood
Date: 2020-06-09T08:10:17+01:00
New Revision: cc8872400c3c0960e2673ea67ab6bcfcc9b3966b
URL: https://github.com/llvm/llvm-project/commit/cc8872400c3c0960e2673ea67ab6bcfcc9b3966b
DIFF: https://github.com/llvm/llvm-project/commit/cc8872400c3c0960e2673ea67ab6bcfcc9b3966b.diff
LOG: [CodeGen] Ensure callers of CreateStackTemporary use sensible alignments
In two instances of CreateStackTemporary we are sometimes promoting
alignments beyond the stack alignment. I have introduced a new function
called getReducedAlign that will return the alignment for the broken-down
parts of illegal vector types. For example, on NEON a <32 x i8>
type is made up of two <16 x i8> types - in this case the sensible
alignment is 16 bytes, not 32.
In the legalization code, wherever we create stack temporaries, I have
started using the reduced alignments instead for illegal vector types.
I added a test to
CodeGen/AArch64/build-one-lane.ll
that tries to insert an element into an illegal fixed vector type
that involves creating a temporary stack object.
Differential Revision: https://reviews.llvm.org/D80370
Added:
Modified:
llvm/include/llvm/CodeGen/SelectionDAG.h
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/AArch64/build-one-lane.ll
llvm/test/CodeGen/AMDGPU/scratch-simple.ll
llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
llvm/test/CodeGen/X86/avx512-insert-extract.ll
llvm/test/CodeGen/X86/extractelement-index.ll
llvm/test/CodeGen/X86/half.ll
llvm/test/CodeGen/X86/i64-mem-copy.ll
llvm/test/CodeGen/X86/insertelement-var-index.ll
llvm/test/CodeGen/X86/pr31088.ll
llvm/test/CodeGen/X86/var-permute-128.ll
llvm/test/CodeGen/X86/vec_fneg.ll
llvm/test/CodeGen/X86/vec_insert-4.ll
llvm/test/CodeGen/X86/vector-extend-inreg.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 4e452c2941fb..f9706ee9f4e4 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1608,6 +1608,12 @@ class SelectionDAG {
void dump() const;
+ /// In most cases this function returns the ABI alignment for a given type,
+ /// except for illegal vector types where the alignment exceeds that of the
+ /// stack. In such cases we attempt to break the vector down to a legal type
+ /// and return the ABI alignment for that instead.
+ Align getReducedAlign(EVT VT, bool UseABI);
+
/// Create a stack temporary based on the size in bytes and the alignment
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 9885110d64f9..2e1377c2c173 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -889,12 +889,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
SDLoc dl(Op);
// Create the stack frame object. Make sure it is aligned for both
// the source and destination types.
- SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align DestAlign = DAG.getReducedAlign(DestVT, /*UseABI=*/false);
+ Align OpAlign = DAG.getReducedAlign(Op.getValueType(), /*UseABI=*/false);
+ Align Align = std::max(DestAlign, OpAlign);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(Op.getValueType().getStoreSize(), Align);
// Emit a store to the stack slot.
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, MachinePointerInfo());
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
+ MachinePointerInfo(), Align);
// Result is a load from the stack slot.
- return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo());
+ return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
}
/// Replace the node's results with custom code provided by the target and
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 666f128a4cc2..9cd3b8f76d6c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -156,9 +156,13 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Create the stack frame object. Make sure it is aligned for both
// the source and expanded destination types.
- Align Alignment = DAG.getDataLayout().getPrefTypeAlign(
- NOutVT.getTypeForEVT(*DAG.getContext()));
- SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment.value());
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align InAlign = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ Align NOutAlign = DAG.getReducedAlign(NOutVT, /*UseABI=*/false);
+ Align Align = std::max(InAlign, NOutAlign);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Align);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
@@ -167,7 +171,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo);
// Load the first half from the stack slot.
- Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, Alignment);
+ Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, NOutAlign);
// Increment the pointer to the other half.
unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
@@ -175,7 +179,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Load the second half from the stack slot.
Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), Alignment);
+ PtrInfo.getWithOffset(IncrementSize), NOutAlign);
// Handle endianness of the load.
if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 9ebf4ea9637c..297b8aa3e848 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1158,15 +1158,17 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
}
// Spill the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Store the new subvector into the specified index.
SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
@@ -1174,7 +1176,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
MachinePointerInfo::getUnknownStack(MF));
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo, Alignment);
+ Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo,
+ SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
@@ -1182,7 +1185,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), Alignment);
+ PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
}
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
@@ -1454,27 +1457,30 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
}
// Spill the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
- Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
- MachinePointerInfo::getUnknownStack(MF), EltVT);
+ Store = DAG.getTruncStore(
+ Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
+ commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, Alignment);
+ Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = LoVT.getSizeInBits() / 8;
@@ -1482,7 +1488,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), Alignment);
+ PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
// If we adjusted the original type, we need to truncate the results.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
@@ -2223,11 +2229,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
}
// Store the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Load back the required element.
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
@@ -2242,7 +2253,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
return DAG.getExtLoad(
ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT,
+ commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
}
SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 44c2b01ae976..80f03c422284 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1993,6 +1993,34 @@ SDValue SelectionDAG::expandVACopy(SDNode *Node) {
MachinePointerInfo(VD));
}
+Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
+ const DataLayout &DL = getDataLayout();
+ Type *Ty = VT.getTypeForEVT(*getContext());
+ Align RedAlign = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+
+ if (TLI->isTypeLegal(VT) || !VT.isVector())
+ return RedAlign;
+
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ const Align StackAlign = TFI->getStackAlign();
+
+ // See if we can choose a smaller ABI alignment in cases where it's an
+ // illegal vector type that will get broken down.
+ if (RedAlign > StackAlign) {
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ unsigned NumRegs = TLI->getVectorTypeBreakdown(
+ *getContext(), VT, IntermediateVT, NumIntermediates, RegisterVT);
+ Ty = IntermediateVT.getTypeForEVT(*getContext());
+ Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+ if (RedAlign2 < RedAlign)
+ RedAlign = RedAlign2;
+ }
+
+ return RedAlign;
+}
+
SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
MachineFrameInfo &MFI = MF->getFrameInfo();
int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false);
diff --git a/llvm/test/CodeGen/AArch64/build-one-lane.ll b/llvm/test/CodeGen/AArch64/build-one-lane.ll
index 55225975c515..78dfaa9d1769 100644
--- a/llvm/test/CodeGen/AArch64/build-one-lane.ll
+++ b/llvm/test/CodeGen/AArch64/build-one-lane.ll
@@ -270,3 +270,15 @@ define void @v2f64st(<2 x double>* %p, double %s) nounwind {
; CHECK: mov v[[R]].d[1], v{{[0-9]+}}.d[0]
; CHECK: str q[[R]], [x{{[0-9]+}}]
}
+
+; In this test the illegal type has a preferred alignment greater than the
+; stack alignment, that gets reduced to the alignment of a broken down
+; legal type.
+define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_32xi8
+; CHECK: stp q0, q1, [sp, #-32]!
+; CHECK: ldp q0, q1, [sp], #32
+ %b = insertelement <32 x i8> %a, i8 30, i32 %x
+ ret <32 x i8> %b
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 0b2eb6a7ae17..2c852f066c4f 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -26,8 +26,8 @@
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; GCN-NOT: s_mov_b32 s0
-; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
-; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
+; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
+; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index d9327368ac82..f42df585df2a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -25,7 +25,7 @@
; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1536
+; GCN: ScratchSize: 768
define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index d2062f21762c..f6ffd6419c13 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1768,8 +1768,8 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-128, %rsp
-; KNL-NEXT: subq $256, %rsp ## imm = 0x100
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $192, %rsp
; KNL-NEXT: movl 744(%rbp), %eax
; KNL-NEXT: andl $127, %eax
; KNL-NEXT: vmovd %edi, %xmm0
@@ -1939,8 +1939,8 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-128, %rsp
-; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $192, %rsp
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
@@ -2076,8 +2076,8 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-128, %rsp
-; KNL-NEXT: subq $256, %rsp ## imm = 0x100
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $192, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
@@ -2153,8 +2153,8 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-128, %rsp
-; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $192, %rsp
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll
index a95c4daf3b09..cf06f8dcb13e 100644
--- a/llvm/test/CodeGen/X86/extractelement-index.ll
+++ b/llvm/test/CodeGen/X86/extractelement-index.ll
@@ -443,16 +443,10 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v32i8_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $31, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movb (%rsp,%rdi), %al
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movb -40(%rsp,%rdi), %al
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v32i8_var:
@@ -493,16 +487,10 @@ define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i16_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $15, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i16_var:
@@ -543,16 +531,10 @@ define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i32_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $7, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movl (%rsp,%rdi,4), %eax
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -40(%rsp,%rdi,4), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i32_var:
@@ -593,16 +575,10 @@ define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v4i64_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $3, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movq (%rsp,%rdi,8), %rax
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -40(%rsp,%rdi,8), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i64_var:
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 82d1fbe2e0dd..1fabce24cc25 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -382,16 +382,16 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-NEXT: subq $88, %rsp
; CHECK-LIBCALL-NEXT: movl (%rdi), %eax
; CHECK-LIBCALL-NEXT: movl 4(%rdi), %ecx
-; CHECK-LIBCALL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-LIBCALL-NEXT: movl %eax, (%rsp)
; CHECK-LIBCALL-NEXT: movl %ecx, {{[0-9]+}}(%rsp)
-; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0
; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -400,11 +400,11 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-LIBCALL-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0]
diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll
index 4d3e3623ad37..0190e91216ce 100644
--- a/llvm/test/CodeGen/X86/i64-mem-copy.ll
+++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll
@@ -109,34 +109,28 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) {
define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) nounwind {
; X64-LABEL: PR23476:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: andq $-64, %rsp
-; X64-NEXT: subq $128, %rsp
; X64-NEXT: movq %rsi, %xmm0
; X64-NEXT: movq %rdi, %xmm1
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: movq %rdx, %xmm2
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X64-NEXT: movl 16(%rbp), %eax
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: andl $7, %eax
; X64-NEXT: movq %r8, %xmm0
-; X64-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp)
-; X64-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp)
-; X64-NEXT: movdqa %xmm1, (%rsp)
-; X64-NEXT: movq (%rsp,%rax,8), %rax
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -72(%rsp,%rax,8), %rax
; X64-NEXT: movq %rax, (%r9)
-; X64-NEXT: movq %rbp, %rsp
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; X32-LABEL: PR23476:
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $128, %esp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $80, %esp
; X32-NEXT: movl 52(%ebp), %eax
; X32-NEXT: andl $7, %eax
; X32-NEXT: movl 48(%ebp), %ecx
@@ -156,8 +150,8 @@ define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) nounwind {
; X32AVX: # %bb.0:
; X32AVX-NEXT: pushl %ebp
; X32AVX-NEXT: movl %esp, %ebp
-; X32AVX-NEXT: andl $-64, %esp
-; X32AVX-NEXT: subl $128, %esp
+; X32AVX-NEXT: andl $-32, %esp
+; X32AVX-NEXT: subl $96, %esp
; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32AVX-NEXT: movl 52(%ebp), %eax
; X32AVX-NEXT: andl $7, %eax
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll
index 564c789c9880..2e3824c8f03f 100644
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -262,17 +262,11 @@ define <2 x double> @load_f64_v2f64(double* %p, i32 %y) nounwind {
define <32 x i8> @arg_i8_v32i8(i8 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i8_v32i8:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $31, %esi
-; SSE-NEXT: movb %dil, (%rsp,%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movb %dil, -40(%rsp,%rsi)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i8_v32i8:
@@ -295,17 +289,11 @@ define <32 x i8> @arg_i8_v32i8(i8 %x, i32 %y) nounwind {
define <16 x i16> @arg_i16_v16i16(i16 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i16_v16i16:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $15, %esi
-; SSE-NEXT: movw %di, (%rsp,%rsi,2)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movw %di, -40(%rsp,%rsi,2)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i16_v16i16:
@@ -328,17 +316,11 @@ define <16 x i16> @arg_i16_v16i16(i16 %x, i32 %y) nounwind {
define <8 x i32> @arg_i32_v8i32(i32 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i32_v8i32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movl %edi, (%rsp,%rsi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movl %edi, -40(%rsp,%rsi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i32_v8i32:
@@ -360,17 +342,11 @@ define <8 x i32> @arg_i32_v8i32(i32 %x, i32 %y) nounwind {
define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i64_v4i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $3, %esi
-; SSE-NEXT: movq %rdi, (%rsp,%rsi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movq %rdi, -40(%rsp,%rsi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i64_v4i64:
@@ -392,17 +368,11 @@ define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
define <8 x float> @arg_f32_v8f32(float %x, i32 %y) nounwind {
; SSE-LABEL: arg_f32_v8f32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: andl $7, %edi
-; SSE-NEXT: movss %xmm0, (%rsp,%rdi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movss %xmm0, -40(%rsp,%rdi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_f32_v8f32:
@@ -422,17 +392,11 @@ define <8 x float> @arg_f32_v8f32(float %x, i32 %y) nounwind {
define <4 x double> @arg_f64_v4f64(double %x, i32 %y) nounwind {
; SSE-LABEL: arg_f64_v4f64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: andl $3, %edi
-; SSE-NEXT: movsd %xmm0, (%rsp,%rdi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movsd %xmm0, -40(%rsp,%rdi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_f64_v4f64:
@@ -452,18 +416,12 @@ define <4 x double> @arg_f64_v4f64(double %x, i32 %y) nounwind {
define <32 x i8> @load_i8_v32i8(i8* %p, i32 %y) nounwind {
; SSE-LABEL: load_i8_v32i8:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movb (%rdi), %al
; SSE-NEXT: andl $31, %esi
-; SSE-NEXT: movb %al, (%rsp,%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movb %al, -40(%rsp,%rsi)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_v32i8:
@@ -487,18 +445,12 @@ define <32 x i8> @load_i8_v32i8(i8* %p, i32 %y) nounwind {
define <16 x i16> @load_i16_v16i16(i16* %p, i32 %y) nounwind {
; SSE-LABEL: load_i16_v16i16:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: andl $15, %esi
-; SSE-NEXT: movw %ax, (%rsp,%rsi,2)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movw %ax, -40(%rsp,%rsi,2)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i16_v16i16:
@@ -522,18 +474,12 @@ define <16 x i16> @load_i16_v16i16(i16* %p, i32 %y) nounwind {
define <8 x i32> @load_i32_v8i32(i32* %p, i32 %y) nounwind {
; SSE-LABEL: load_i32_v8i32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movl %eax, (%rsp,%rsi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movl %eax, -40(%rsp,%rsi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_v8i32:
@@ -548,18 +494,12 @@ define <8 x i32> @load_i32_v8i32(i32* %p, i32 %y) nounwind {
define <4 x i64> @load_i64_v4i64(i64* %p, i32 %y) nounwind {
; SSE-LABEL: load_i64_v4i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movq (%rdi), %rax
; SSE-NEXT: andl $3, %esi
-; SSE-NEXT: movq %rax, (%rsp,%rsi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movq %rax, -40(%rsp,%rsi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_i64_v4i64:
@@ -574,18 +514,12 @@ define <4 x i64> @load_i64_v4i64(i64* %p, i32 %y) nounwind {
define <8 x float> @load_f32_v8f32(float* %p, i32 %y) nounwind {
; SSE-LABEL: load_f32_v8f32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movss %xmm0, (%rsp,%rsi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movss %xmm0, -40(%rsp,%rsi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_f32_v8f32:
@@ -600,18 +534,12 @@ define <8 x float> @load_f32_v8f32(float* %p, i32 %y) nounwind {
define <4 x double> @load_f64_v4f64(double* %p, i32 %y) nounwind {
; SSE-LABEL: load_f64_v4f64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: andl $3, %esi
-; SSE-NEXT: movsd %xmm0, (%rsp,%rsi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movsd %xmm0, -40(%rsp,%rsi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_f64_v4f64:
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index f5c1b3bc2351..3d4cf50fcf07 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -66,7 +66,7 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
+; X86-NEXT: subl $64, %esp
; X86-NEXT: movzwl 8(%ebp), %esi
; X86-NEXT: movzwl 12(%ebp), %edi
; X86-NEXT: movzwl 20(%ebp), %ebx
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 9767f8624572..8bc971e79f50 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -643,116 +643,112 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
-; SSE3-NEXT: movq %rsp, %rbp
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: pushq %rbx
-; SSE3-NEXT: andq $-32, %rsp
-; SSE3-NEXT: subq $608, %rsp # imm = 0x260
-; SSE3-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE3-NEXT: subq $424, %rsp # imm = 0x1A8
+; SSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
-; SSE3-NEXT: andl $31, %r9d
-; SSE3-NEXT: movzbl 64(%rsp,%r9), %ebx
-; SSE3-NEXT: movd %ebx, %xmm8
-; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 96(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm15
+; SSE3-NEXT: movaps %xmm0, (%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: andl $31, %r8d
+; SSE3-NEXT: movzbl -96(%rsp,%r8), %esi
+; SSE3-NEXT: movd %esi, %xmm8
+; SSE3-NEXT: andl $31, %ebp
+; SSE3-NEXT: movzbl -64(%rsp,%rbp), %esi
+; SSE3-NEXT: movd %esi, %xmm15
; SSE3-NEXT: andl $31, %edx
-; SSE3-NEXT: movzbl 128(%rsp,%rdx), %eax
-; SSE3-NEXT: movd %eax, %xmm9
+; SSE3-NEXT: movzbl -32(%rsp,%rdx), %edx
+; SSE3-NEXT: movd %edx, %xmm9
; SSE3-NEXT: andl $31, %ecx
-; SSE3-NEXT: movzbl 160(%rsp,%rcx), %eax
-; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: andl $31, %esi
-; SSE3-NEXT: movzbl 192(%rsp,%rsi), %eax
+; SSE3-NEXT: movzbl (%rsp,%rcx), %ecx
+; SSE3-NEXT: movd %ecx, %xmm3
+; SSE3-NEXT: andl $31, %eax
+; SSE3-NEXT: movzbl 32(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: andl $31, %edi
-; SSE3-NEXT: movzbl 224(%rsp,%rdi), %eax
+; SSE3-NEXT: movzbl 64(%rsp,%rdi), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: andl $31, %r8d
-; SSE3-NEXT: movzbl 256(%rsp,%r8), %eax
+; SSE3-NEXT: andl $31, %ebx
+; SSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: andl $31, %r10d
-; SSE3-NEXT: movzbl 288(%rsp,%r10), %eax
+; SSE3-NEXT: andl $31, %r9d
+; SSE3-NEXT: movzbl 128(%rsp,%r9), %eax
; SSE3-NEXT: movd %eax, %xmm6
; SSE3-NEXT: andl $31, %r13d
-; SSE3-NEXT: movzbl 320(%rsp,%r13), %eax
+; SSE3-NEXT: movzbl 160(%rsp,%r13), %eax
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: andl $31, %r12d
-; SSE3-NEXT: movzbl 352(%rsp,%r12), %eax
+; SSE3-NEXT: movzbl 192(%rsp,%r12), %eax
; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: andl $31, %r15d
-; SSE3-NEXT: movzbl 384(%rsp,%r15), %eax
+; SSE3-NEXT: movzbl 224(%rsp,%r15), %eax
; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: andl $31, %r14d
-; SSE3-NEXT: movzbl 416(%rsp,%r14), %eax
+; SSE3-NEXT: movzbl 256(%rsp,%r14), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: andl $31, %r11d
-; SSE3-NEXT: movzbl 448(%rsp,%r11), %eax
+; SSE3-NEXT: movzbl 288(%rsp,%r11), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 480(%rsp,%rax), %eax
+; SSE3-NEXT: andl $31, %r10d
+; SSE3-NEXT: movzbl 320(%rsp,%r10), %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 512(%rsp,%rax), %eax
+; SSE3-NEXT: movzbl 352(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 544(%rsp,%rax), %eax
+; SSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -769,7 +765,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE3-NEXT: leaq -40(%rbp), %rsp
+; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
@@ -781,116 +777,112 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: movq %rsp, %rbp
; SSSE3-NEXT: pushq %r15
; SSSE3-NEXT: pushq %r14
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: andq $-32, %rsp
-; SSSE3-NEXT: subq $608, %rsp # imm = 0x260
-; SSSE3-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: subq $424, %rsp # imm = 0x1A8
+; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: andl $31, %r9d
-; SSSE3-NEXT: movzbl 64(%rsp,%r9), %ebx
-; SSSE3-NEXT: movd %ebx, %xmm8
-; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 96(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movaps %xmm0, (%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSSE3-NEXT: andl $31, %r8d
+; SSSE3-NEXT: movzbl -96(%rsp,%r8), %esi
+; SSSE3-NEXT: movd %esi, %xmm8
+; SSSE3-NEXT: andl $31, %ebp
+; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %esi
+; SSSE3-NEXT: movd %esi, %xmm15
; SSSE3-NEXT: andl $31, %edx
-; SSSE3-NEXT: movzbl 128(%rsp,%rdx), %eax
-; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: movzbl -32(%rsp,%rdx), %edx
+; SSSE3-NEXT: movd %edx, %xmm9
; SSSE3-NEXT: andl $31, %ecx
-; SSSE3-NEXT: movzbl 160(%rsp,%rcx), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: andl $31, %esi
-; SSSE3-NEXT: movzbl 192(%rsp,%rsi), %eax
+; SSSE3-NEXT: movzbl (%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: andl $31, %eax
+; SSSE3-NEXT: movzbl 32(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm10
; SSSE3-NEXT: andl $31, %edi
-; SSSE3-NEXT: movzbl 224(%rsp,%rdi), %eax
+; SSSE3-NEXT: movzbl 64(%rsp,%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm7
-; SSSE3-NEXT: andl $31, %r8d
-; SSSE3-NEXT: movzbl 256(%rsp,%r8), %eax
+; SSSE3-NEXT: andl $31, %ebx
+; SSSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
; SSSE3-NEXT: movd %eax, %xmm11
-; SSSE3-NEXT: andl $31, %r10d
-; SSSE3-NEXT: movzbl 288(%rsp,%r10), %eax
+; SSSE3-NEXT: andl $31, %r9d
+; SSSE3-NEXT: movzbl 128(%rsp,%r9), %eax
; SSSE3-NEXT: movd %eax, %xmm6
; SSSE3-NEXT: andl $31, %r13d
-; SSSE3-NEXT: movzbl 320(%rsp,%r13), %eax
+; SSSE3-NEXT: movzbl 160(%rsp,%r13), %eax
; SSSE3-NEXT: movd %eax, %xmm12
; SSSE3-NEXT: andl $31, %r12d
-; SSSE3-NEXT: movzbl 352(%rsp,%r12), %eax
+; SSSE3-NEXT: movzbl 192(%rsp,%r12), %eax
; SSSE3-NEXT: movd %eax, %xmm5
; SSSE3-NEXT: andl $31, %r15d
-; SSSE3-NEXT: movzbl 384(%rsp,%r15), %eax
+; SSSE3-NEXT: movzbl 224(%rsp,%r15), %eax
; SSSE3-NEXT: movd %eax, %xmm13
; SSSE3-NEXT: andl $31, %r14d
-; SSSE3-NEXT: movzbl 416(%rsp,%r14), %eax
+; SSSE3-NEXT: movzbl 256(%rsp,%r14), %eax
; SSSE3-NEXT: movd %eax, %xmm4
; SSSE3-NEXT: andl $31, %r11d
-; SSSE3-NEXT: movzbl 448(%rsp,%r11), %eax
+; SSSE3-NEXT: movzbl 288(%rsp,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 480(%rsp,%rax), %eax
+; SSSE3-NEXT: andl $31, %r10d
+; SSSE3-NEXT: movzbl 320(%rsp,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 512(%rsp,%rax), %eax
+; SSSE3-NEXT: movzbl 352(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 544(%rsp,%rax), %eax
+; SSSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -907,7 +899,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSSE3-NEXT: leaq -40(%rbp), %rsp
+; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
@@ -918,10 +910,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
;
; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pushq %rbp
-; SSE41-NEXT: movq %rsp, %rbp
-; SSE41-NEXT: andq $-32, %rsp
-; SSE41-NEXT: subq $544, %rsp # imm = 0x220
+; SSE41-NEXT: subq $392, %rsp # imm = 0x188
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
@@ -947,64 +936,63 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm0, (%rsp)
-; SSE41-NEXT: movzbl 480(%rsp,%rax), %eax
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pextrb $1, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $1, 448(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $1, 320(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $2, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $2, 416(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $2, 288(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $3, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $3, 384(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $3, 256(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $4, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $4, 352(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $4, 224(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $5, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $5, 320(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $5, 192(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $6, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $6, 288(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $6, 160(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $7, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $7, 256(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $7, 128(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $8, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $8, 224(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $8, 96(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $9, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $9, 192(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $9, 64(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $10, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $10, 160(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $10, 32(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $11, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $11, 128(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $11, (%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $12, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $12, 96(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $12, -32(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $13, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $13, 64(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $13, -64(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $14, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $14, 32(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $14, -96(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $15, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $15, (%rsp,%rax), %xmm0
-; SSE41-NEXT: movq %rbp, %rsp
-; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: pinsrb $15, -128(%rsp,%rax), %xmm0
+; SSE41-NEXT: addq $392, %rsp # imm = 0x188
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll
index c3c1932c2311..3794bd2ce94b 100644
--- a/llvm/test/CodeGen/X86/vec_fneg.ll
+++ b/llvm/test/CodeGen/X86/vec_fneg.ll
@@ -121,7 +121,7 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind {
; X32-SSE1-NEXT: pushl %ebp
; X32-SSE1-NEXT: movl %esp, %ebp
; X32-SSE1-NEXT: andl $-16, %esp
-; X32-SSE1-NEXT: subl $32, %esp
+; X32-SSE1-NEXT: subl $16, %esp
; X32-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000
; X32-SSE1-NEXT: movl 12(%ebp), %ecx
; X32-SSE1-NEXT: xorl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_insert-4.ll b/llvm/test/CodeGen/X86/vec_insert-4.ll
index 2c34b3b7d7a1..ed8833b95b2b 100644
--- a/llvm/test/CodeGen/X86/vec_insert-4.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-4.ll
@@ -5,36 +5,26 @@
define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X32-LABEL: f:
; X32: ## %bb.0: ## %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-32, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: subl $44, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $7, %eax
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
; X32-NEXT: movaps (%esp), %xmm0
; X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
+; X32-NEXT: addl $44, %esp
; X32-NEXT: retl
;
; X64-LABEL: f:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: pushq %rbp
-; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: andq $-32, %rsp
-; X64-NEXT: subq $64, %rsp
; X64-NEXT: ## kill: def $edi killed $edi def $rdi
-; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm0, (%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %edi
-; X64-NEXT: movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000
-; X64-NEXT: movaps (%rsp), %xmm0
-; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; X64-NEXT: movq %rbp, %rsp
-; X64-NEXT: popq %rbp
+; X64-NEXT: movl $1084227584, -40(%rsp,%rdi,4) ## imm = 0x40A00000
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; X64-NEXT: retq
entry:
%vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
index 98a35c4a7934..f6ab64975cd3 100644
--- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
@@ -9,8 +9,8 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: andl $-128, %esp
-; X32-SSE-NEXT: subl $384, %esp # imm = 0x180
+; X32-SSE-NEXT: andl $-16, %esp
+; X32-SSE-NEXT: subl $272, %esp # imm = 0x110
; X32-SSE-NEXT: movl 88(%ebp), %ecx
; X32-SSE-NEXT: movdqa 72(%ebp), %xmm0
; X32-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
@@ -43,33 +43,29 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
;
; X64-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pushq %rbp
-; X64-SSE-NEXT: movq %rsp, %rbp
-; X64-SSE-NEXT: andq $-128, %rsp
-; X64-SSE-NEXT: subq $256, %rsp # imm = 0x100
+; X64-SSE-NEXT: pushq %rax
; X64-SSE-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT: xorps %xmm0, %xmm0
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, (%rsp)
-; X64-SSE-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: andl $15, %edi
-; X64-SSE-NEXT: movq (%rsp,%rdi,8), %rax
-; X64-SSE-NEXT: movq %rbp, %rsp
-; X64-SSE-NEXT: popq %rbp
+; X64-SSE-NEXT: movq -128(%rsp,%rdi,8), %rax
+; X64-SSE-NEXT: popq %rcx
; X64-SSE-NEXT: retq
;
; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
-; X32-AVX-NEXT: andl $-128, %esp
-; X32-AVX-NEXT: subl $384, %esp # imm = 0x180
+; X32-AVX-NEXT: andl $-32, %esp
+; X32-AVX-NEXT: subl $288, %esp # imm = 0x120
; X32-AVX-NEXT: movl 40(%ebp), %ecx
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
@@ -96,8 +92,8 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbp
; X64-AVX-NEXT: movq %rsp, %rbp
-; X64-AVX-NEXT: andq $-128, %rsp
-; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $160, %rsp
; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi
; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
More information about the llvm-commits
mailing list