[llvm] [SelectionDAG] Use unaligned store to legalize `EXTRACT_VECTOR_ELT` type (PR #98176)
Manish Kausik H via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 26 03:39:16 PDT 2024
https://github.com/Nirhar updated https://github.com/llvm/llvm-project/pull/98176
>From 6a747762e1aa75608f6e356da7fafa5fe6cffb37 Mon Sep 17 00:00:00 2001
From: Manish Kausik H <hmamishkausik at gmail.com>
Date: Fri, 26 Jul 2024 16:07:26 +0530
Subject: [PATCH] [SelectionDAG] Use unaligned store to legalize
`EXTRACT_VECTOR_ELT` type
This patch sets the alignment of the store instructions generated during type
legalization of the extractelement instruction, clamping it so that it never
exceeds the stack alignment.
Fixes #98044
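
For illustration, a minimal standalone sketch (plain C++, not the LLVM API) of
the clamping idea; the 16-byte stack alignment and 64-byte type alignment are
assumed values chosen to mirror the RISC-V tests below:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
      // Assumed values: 16-byte target stack alignment, 64-byte alignment
      // derived from the large vector type being spilled to the stack.
      uint64_t StackAlign = 16;
      uint64_t RedAlign = 64;

      // Before the patch the 64-byte requirement forced stack realignment
      // (the "andi sp, sp, -64" sequences removed in the tests below).
      // Clamping to the stack alignment lets the store reuse the existing
      // stack alignment instead.
      RedAlign = std::min(RedAlign, StackAlign);

      std::cout << "store alignment used: " << RedAlign << "\n"; // prints 16
      return 0;
    }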
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +
.../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll | 141 ++----
llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll | 117 ++---
.../CodeGen/RISCV/rvv/extractelt-int-rv32.ll | 44 +-
.../CodeGen/RISCV/rvv/extractelt-int-rv64.ll | 44 +-
.../RISCV/rvv/fixed-vectors-extract-i1.ll | 146 +-----
.../RISCV/rvv/fixed-vectors-extract.ll | 75 +--
.../rvv/fixed-vectors-insert-subvector.ll | 170 ++-----
.../CodeGen/RISCV/rvv/fixed-vectors-insert.ll | 75 +--
.../RISCV/rvv/named-vector-shuffle-reverse.ll | 103 ++--
llvm/test/CodeGen/RISCV/rvv/pr88576.ll | 22 +-
llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll | 22 +-
.../test/CodeGen/X86/avx512-insert-extract.ll | 446 +++++++++---------
llvm/test/CodeGen/X86/avx512fp16-mov.ll | 26 +-
llvm/test/CodeGen/X86/gep-expanded-vector.ll | 26 +-
llvm/test/CodeGen/X86/i64-mem-copy.ll | 40 +-
...igned_extract_from_vector_through_stack.ll | 18 +
llvm/test/CodeGen/X86/vector-extend-inreg.ll | 74 ++-
19 files changed, 580 insertions(+), 1013 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 79f90bae1d8d6..04f18e0859c1f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2481,6 +2481,8 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
if (RedAlign2 < RedAlign)
RedAlign = RedAlign2;
+ // If RedAlign is still greater than StackAlign, clamp it down to StackAlign
+ RedAlign = std::min(RedAlign, StackAlign);
}
return RedAlign;
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 0cabfa9aea0e4..69d80ae4041bf 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -25,7 +25,7 @@
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
-; GCN: ScratchSize: 640
+; GCN: ScratchSize: 544
define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
index 3b7952f9f5e6d..2f2cecfa01aad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
@@ -585,19 +585,13 @@ define double @extractelt_nxv16f64_0(<vscale x 16 x double> %v) {
define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
; RV32-LABEL: extractelt_nxv16f64_neg1:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -80
-; RV32-NEXT: .cfi_def_cfa_offset 80
-; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 80
-; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a2, a1, 3
@@ -606,27 +600,21 @@ define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a0, a1, a0
; RV32-NEXT: fld fa0, -8(a0)
-; RV32-NEXT: addi sp, s0, -80
-; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: extractelt_nxv16f64_neg1:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -80
-; RV64-NEXT: .cfi_def_cfa_offset 80
-; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 80
-; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a1, a2, 3
@@ -643,10 +631,10 @@ define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: fld fa0, 0(a0)
-; RV64-NEXT: addi sp, s0, -80
-; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%r = extractelement <vscale x 16 x double> %v, i32 -1
ret double %r
@@ -664,75 +652,34 @@ define double @extractelt_nxv16f64_imm(<vscale x 16 x double> %v) {
}
define double @extractelt_nxv16f64_idx(<vscale x 16 x double> %v, i32 zeroext %idx) {
-; RV32-LABEL: extractelt_nxv16f64_idx:
-; RV32: # %bb.0:
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: bltu a0, a2, .LBB54_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: mv a0, a2
-; RV32-NEXT: .LBB54_2:
-; RV32-NEXT: addi sp, sp, -80
-; RV32-NEXT: .cfi_def_cfa_offset 80
-; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 80
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: addi a2, sp, 64
-; RV32-NEXT: add a0, a2, a0
-; RV32-NEXT: vs8r.v v8, (a2)
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: vs8r.v v16, (a1)
-; RV32-NEXT: fld fa0, 0(a0)
-; RV32-NEXT: addi sp, s0, -80
-; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 80
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extractelt_nxv16f64_idx:
-; RV64: # %bb.0:
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 1
-; RV64-NEXT: addi a2, a2, -1
-; RV64-NEXT: bltu a0, a2, .LBB54_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a0, a2
-; RV64-NEXT: .LBB54_2:
-; RV64-NEXT: addi sp, sp, -80
-; RV64-NEXT: .cfi_def_cfa_offset 80
-; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 80
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: addi a2, sp, 64
-; RV64-NEXT: add a0, a2, a0
-; RV64-NEXT: vs8r.v v8, (a2)
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: vs8r.v v16, (a1)
-; RV64-NEXT: fld fa0, 0(a0)
-; RV64-NEXT: addi sp, s0, -80
-; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 80
-; RV64-NEXT: ret
+; CHECK-LABEL: extractelt_nxv16f64_idx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: bltu a0, a2, .LBB54_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: .LBB54_2:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: add a0, a2, a0
+; CHECK-NEXT: vs8r.v v8, (a2)
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: fld fa0, 0(a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x double> %v, i32 %idx
ret double %r
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
index 14719e190a693..44ef639b68e29 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
@@ -122,85 +122,41 @@ define i1 @extractelt_nxv64i1(ptr %x, i64 %idx) nounwind {
}
define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
-; RV32-LABEL: extractelt_nxv128i1:
-; RV32: # %bb.0:
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a3, a2, 4
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: bltu a1, a3, .LBB7_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: mv a1, a3
-; RV32-NEXT: .LBB7_2:
-; RV32-NEXT: addi sp, sp, -80
-; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 80
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: sub sp, sp, a3
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vl8r.v v16, (a3)
-; RV32-NEXT: vl8r.v v24, (a0)
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: add a1, a0, a1
-; RV32-NEXT: vsetvli a3, zero, e8, m8, ta, ma
-; RV32-NEXT: vmseq.vi v8, v16, 0
-; RV32-NEXT: vmseq.vi v0, v24, 0
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: vmerge.vim v24, v16, 1, v0
-; RV32-NEXT: vs8r.v v24, (a0)
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vmerge.vim v8, v16, 1, v0
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: lbu a0, 0(a1)
-; RV32-NEXT: addi sp, s0, -80
-; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 80
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extractelt_nxv128i1:
-; RV64: # %bb.0:
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 4
-; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: bltu a1, a3, .LBB7_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a1, a3
-; RV64-NEXT: .LBB7_2:
-; RV64-NEXT: addi sp, sp, -80
-; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: addi s0, sp, 80
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a3, a3, 4
-; RV64-NEXT: sub sp, sp, a3
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vl8r.v v16, (a3)
-; RV64-NEXT: vl8r.v v24, (a0)
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: vsetvli a3, zero, e8, m8, ta, ma
-; RV64-NEXT: vmseq.vi v8, v16, 0
-; RV64-NEXT: vmseq.vi v0, v24, 0
-; RV64-NEXT: vmv.v.i v16, 0
-; RV64-NEXT: vmerge.vim v24, v16, 1, v0
-; RV64-NEXT: vs8r.v v24, (a0)
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: vmv1r.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v16, 1, v0
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: lbu a0, 0(a1)
-; RV64-NEXT: addi sp, s0, -80
-; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 80
-; RV64-NEXT: ret
+; CHECK-LABEL: extractelt_nxv128i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a3, a2, 4
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: bltu a1, a3, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 4
+; CHECK-NEXT: sub sp, sp, a3
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vl8r.v v16, (a3)
+; CHECK-NEXT: vl8r.v v24, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmseq.vi v8, v16, 0
+; CHECK-NEXT: vmseq.vi v0, v24, 0
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmerge.vim v24, v16, 1, v0
+; CHECK-NEXT: vs8r.v v24, (a0)
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: lbu a0, 0(a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%a = load <vscale x 128 x i8>, ptr %x
%b = icmp eq <vscale x 128 x i8> %a, zeroinitializer
%c = extractelement <vscale x 128 x i1> %b, i64 %idx
@@ -311,3 +267,6 @@ define i1 @extractelt_nxv64i1_idx0(ptr %x) nounwind {
%c = extractelement <vscale x 64 x i1> %b, i64 0
ret i1 %c
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index df9949e617b80..9b157c879b40b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -863,19 +863,13 @@ define i32 @extractelt_nxv32i32_0(<vscale x 32 x i32> %v) {
define i32 @extractelt_nxv32i32_neg1(<vscale x 32 x i32> %v) {
; CHECK-LABEL: extractelt_nxv32i32_neg1:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -80
-; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; CHECK-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; CHECK-NEXT: .cfi_offset ra, -4
-; CHECK-NEXT: .cfi_offset s0, -8
-; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: .cfi_def_cfa s0, 0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: addi a0, sp, 64
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a2, a1, 3
@@ -884,10 +878,10 @@ define i32 @extractelt_nxv32i32_neg1(<vscale x 32 x i32> %v) {
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: lw a0, -4(a0)
-; CHECK-NEXT: addi sp, s0, -80
-; CHECK-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; CHECK-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 80
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i32> %v, i32 -1
ret i32 %r
@@ -914,30 +908,24 @@ define i32 @extractelt_nxv32i32_idx(<vscale x 32 x i32> %v, i32 %idx) {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: .LBB74_2:
-; CHECK-NEXT: addi sp, sp, -80
-; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; CHECK-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; CHECK-NEXT: .cfi_offset ra, -4
-; CHECK-NEXT: .cfi_offset s0, -8
-; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: .cfi_def_cfa s0, 0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: andi sp, sp, -64
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: addi a2, sp, 64
+; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: add a0, a2, a0
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vs8r.v v16, (a1)
; CHECK-NEXT: lw a0, 0(a0)
-; CHECK-NEXT: addi sp, s0, -80
-; CHECK-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; CHECK-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 80
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 32 x i32> %v, i32 %idx
ret i32 %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index a96cf5807e6c1..c1865e1ba66e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -849,19 +849,13 @@ define i64 @extractelt_nxv16i64_0(<vscale x 16 x i64> %v) {
define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
; CHECK-LABEL: extractelt_nxv16i64_neg1:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -80
-; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset ra, -8
-; CHECK-NEXT: .cfi_offset s0, -16
-; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: .cfi_def_cfa s0, 0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: addi a0, sp, 64
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a1, a2, 3
@@ -878,10 +872,10 @@ define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: ld a0, 0(a0)
-; CHECK-NEXT: addi sp, s0, -80
-; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 80
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i64> %v, i32 -1
ret i64 %r
@@ -908,30 +902,24 @@ define i64 @extractelt_nxv16i64_idx(<vscale x 16 x i64> %v, i32 zeroext %idx) {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: .LBB76_2:
-; CHECK-NEXT: addi sp, sp, -80
-; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset ra, -8
-; CHECK-NEXT: .cfi_offset s0, -16
-; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: .cfi_def_cfa s0, 0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: andi sp, sp, -64
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: addi a2, sp, 64
+; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: add a0, a2, a0
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vs8r.v v16, (a1)
; CHECK-NEXT: ld a0, 0(a0)
-; CHECK-NEXT: addi sp, s0, -80
-; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 80
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%r = extractelement <vscale x 16 x i64> %v, i32 %idx
ret i64 %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index 386c71cf665ce..c9a48c4898e5b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -317,129 +317,29 @@ define i1 @extractelt_v128i1(ptr %x, i64 %idx) nounwind {
}
define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
-; RV32-LABEL: extractelt_v256i1:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -384
-; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 384
-; RV32-NEXT: andi sp, sp, -128
-; RV32-NEXT: andi a1, a1, 255
-; RV32-NEXT: li a2, 128
-; RV32-NEXT: addi a3, a0, 128
-; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT: vle8.v v16, (a3)
-; RV32-NEXT: vle8.v v24, (a0)
-; RV32-NEXT: mv a0, sp
-; RV32-NEXT: add a1, a0, a1
-; RV32-NEXT: vmseq.vi v8, v16, 0
-; RV32-NEXT: vmseq.vi v0, v24, 0
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: vmerge.vim v24, v16, 1, v0
-; RV32-NEXT: vse8.v v24, (a0)
-; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vmerge.vim v8, v16, 1, v0
-; RV32-NEXT: addi a0, sp, 128
-; RV32-NEXT: vse8.v v8, (a0)
-; RV32-NEXT: lbu a0, 0(a1)
-; RV32-NEXT: addi sp, s0, -384
-; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 384
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extractelt_v256i1:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -384
-; RV64-NEXT: sd ra, 376(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill
-; RV64-NEXT: addi s0, sp, 384
-; RV64-NEXT: andi sp, sp, -128
-; RV64-NEXT: andi a1, a1, 255
-; RV64-NEXT: li a2, 128
-; RV64-NEXT: addi a3, a0, 128
-; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT: vle8.v v16, (a3)
-; RV64-NEXT: vle8.v v24, (a0)
-; RV64-NEXT: mv a0, sp
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: vmseq.vi v8, v16, 0
-; RV64-NEXT: vmseq.vi v0, v24, 0
-; RV64-NEXT: vmv.v.i v16, 0
-; RV64-NEXT: vmerge.vim v24, v16, 1, v0
-; RV64-NEXT: vse8.v v24, (a0)
-; RV64-NEXT: vmv1r.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v16, 1, v0
-; RV64-NEXT: addi a0, sp, 128
-; RV64-NEXT: vse8.v v8, (a0)
-; RV64-NEXT: lbu a0, 0(a1)
-; RV64-NEXT: addi sp, s0, -384
-; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 384
-; RV64-NEXT: ret
-;
-; RV32ZBS-LABEL: extractelt_v256i1:
-; RV32ZBS: # %bb.0:
-; RV32ZBS-NEXT: addi sp, sp, -384
-; RV32ZBS-NEXT: sw ra, 380(sp) # 4-byte Folded Spill
-; RV32ZBS-NEXT: sw s0, 376(sp) # 4-byte Folded Spill
-; RV32ZBS-NEXT: addi s0, sp, 384
-; RV32ZBS-NEXT: andi sp, sp, -128
-; RV32ZBS-NEXT: andi a1, a1, 255
-; RV32ZBS-NEXT: li a2, 128
-; RV32ZBS-NEXT: addi a3, a0, 128
-; RV32ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; RV32ZBS-NEXT: vle8.v v16, (a3)
-; RV32ZBS-NEXT: vle8.v v24, (a0)
-; RV32ZBS-NEXT: mv a0, sp
-; RV32ZBS-NEXT: add a1, a0, a1
-; RV32ZBS-NEXT: vmseq.vi v8, v16, 0
-; RV32ZBS-NEXT: vmseq.vi v0, v24, 0
-; RV32ZBS-NEXT: vmv.v.i v16, 0
-; RV32ZBS-NEXT: vmerge.vim v24, v16, 1, v0
-; RV32ZBS-NEXT: vse8.v v24, (a0)
-; RV32ZBS-NEXT: vmv1r.v v0, v8
-; RV32ZBS-NEXT: vmerge.vim v8, v16, 1, v0
-; RV32ZBS-NEXT: addi a0, sp, 128
-; RV32ZBS-NEXT: vse8.v v8, (a0)
-; RV32ZBS-NEXT: lbu a0, 0(a1)
-; RV32ZBS-NEXT: addi sp, s0, -384
-; RV32ZBS-NEXT: lw ra, 380(sp) # 4-byte Folded Reload
-; RV32ZBS-NEXT: lw s0, 376(sp) # 4-byte Folded Reload
-; RV32ZBS-NEXT: addi sp, sp, 384
-; RV32ZBS-NEXT: ret
-;
-; RV64ZBS-LABEL: extractelt_v256i1:
-; RV64ZBS: # %bb.0:
-; RV64ZBS-NEXT: addi sp, sp, -384
-; RV64ZBS-NEXT: sd ra, 376(sp) # 8-byte Folded Spill
-; RV64ZBS-NEXT: sd s0, 368(sp) # 8-byte Folded Spill
-; RV64ZBS-NEXT: addi s0, sp, 384
-; RV64ZBS-NEXT: andi sp, sp, -128
-; RV64ZBS-NEXT: andi a1, a1, 255
-; RV64ZBS-NEXT: li a2, 128
-; RV64ZBS-NEXT: addi a3, a0, 128
-; RV64ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; RV64ZBS-NEXT: vle8.v v16, (a3)
-; RV64ZBS-NEXT: vle8.v v24, (a0)
-; RV64ZBS-NEXT: mv a0, sp
-; RV64ZBS-NEXT: add a1, a0, a1
-; RV64ZBS-NEXT: vmseq.vi v8, v16, 0
-; RV64ZBS-NEXT: vmseq.vi v0, v24, 0
-; RV64ZBS-NEXT: vmv.v.i v16, 0
-; RV64ZBS-NEXT: vmerge.vim v24, v16, 1, v0
-; RV64ZBS-NEXT: vse8.v v24, (a0)
-; RV64ZBS-NEXT: vmv1r.v v0, v8
-; RV64ZBS-NEXT: vmerge.vim v8, v16, 1, v0
-; RV64ZBS-NEXT: addi a0, sp, 128
-; RV64ZBS-NEXT: vse8.v v8, (a0)
-; RV64ZBS-NEXT: lbu a0, 0(a1)
-; RV64ZBS-NEXT: addi sp, s0, -384
-; RV64ZBS-NEXT: ld ra, 376(sp) # 8-byte Folded Reload
-; RV64ZBS-NEXT: ld s0, 368(sp) # 8-byte Folded Reload
-; RV64ZBS-NEXT: addi sp, sp, 384
-; RV64ZBS-NEXT: ret
+; CHECK-LABEL: extractelt_v256i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -256
+; CHECK-NEXT: andi a1, a1, 255
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: addi a3, a0, 128
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vle8.v v16, (a3)
+; CHECK-NEXT: vle8.v v24, (a0)
+; CHECK-NEXT: mv a0, sp
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vmseq.vi v8, v16, 0
+; CHECK-NEXT: vmseq.vi v0, v24, 0
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmerge.vim v24, v16, 1, v0
+; CHECK-NEXT: vse8.v v24, (a0)
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT: addi a0, sp, 128
+; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: lbu a0, 0(a1)
+; CHECK-NEXT: addi sp, sp, 256
+; CHECK-NEXT: ret
%a = load <256 x i8>, ptr %x
%b = icmp eq <256 x i8> %a, zeroinitializer
%c = extractelement <256 x i1> %b, i64 %idx
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index d309da6df7dc7..7a46465d7ecb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -700,61 +700,26 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind {
}
define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind {
-; RV32-LABEL: extractelt_v64i32_idx:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -384
-; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 384
-; RV32-NEXT: andi sp, sp, -128
-; RV32-NEXT: andi a1, a1, 63
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: addi a3, a0, 128
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v8, (a3)
-; RV32-NEXT: vle32.v v16, (a0)
-; RV32-NEXT: mv a0, sp
-; RV32-NEXT: add a1, a0, a1
-; RV32-NEXT: vadd.vv v8, v8, v8
-; RV32-NEXT: vadd.vv v16, v16, v16
-; RV32-NEXT: vse32.v v16, (a0)
-; RV32-NEXT: addi a0, sp, 128
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: lw a0, 0(a1)
-; RV32-NEXT: addi sp, s0, -384
-; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 384
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extractelt_v64i32_idx:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -384
-; RV64-NEXT: sd ra, 376(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill
-; RV64-NEXT: addi s0, sp, 384
-; RV64-NEXT: andi sp, sp, -128
-; RV64-NEXT: andi a1, a1, 63
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: li a2, 32
-; RV64-NEXT: addi a3, a0, 128
-; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v8, (a3)
-; RV64-NEXT: vle32.v v16, (a0)
-; RV64-NEXT: mv a0, sp
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: vadd.vv v8, v8, v8
-; RV64-NEXT: vadd.vv v16, v16, v16
-; RV64-NEXT: vse32.v v16, (a0)
-; RV64-NEXT: addi a0, sp, 128
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: lw a0, 0(a1)
-; RV64-NEXT: addi sp, s0, -384
-; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 384
-; RV64-NEXT: ret
+; CHECK-LABEL: extractelt_v64i32_idx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -256
+; CHECK-NEXT: andi a1, a1, 63
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: addi a3, a0, 128
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vle32.v v8, (a3)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: mv a0, sp
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: vadd.vv v16, v16, v16
+; CHECK-NEXT: vse32.v v16, (a0)
+; CHECK-NEXT: addi a0, sp, 128
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: lw a0, 0(a1)
+; CHECK-NEXT: addi sp, sp, 256
+; CHECK-NEXT: ret
%a = load <64 x i32>, ptr %x
%b = add <64 x i32> %a, %a
%c = extractelement <64 x i32> %b, i32 %idx
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index e81f686a28303..51c6a4d286505 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -733,127 +733,52 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
-; RV32VLA-LABEL: insert_v2i64_nxv16i64_hi:
-; RV32VLA: # %bb.0:
-; RV32VLA-NEXT: addi sp, sp, -80
-; RV32VLA-NEXT: .cfi_def_cfa_offset 80
-; RV32VLA-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32VLA-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32VLA-NEXT: .cfi_offset ra, -4
-; RV32VLA-NEXT: .cfi_offset s0, -8
-; RV32VLA-NEXT: addi s0, sp, 80
-; RV32VLA-NEXT: .cfi_def_cfa s0, 0
-; RV32VLA-NEXT: csrr a2, vlenb
-; RV32VLA-NEXT: slli a2, a2, 4
-; RV32VLA-NEXT: sub sp, sp, a2
-; RV32VLA-NEXT: andi sp, sp, -64
-; RV32VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32VLA-NEXT: vle64.v v8, (a0)
-; RV32VLA-NEXT: addi a0, sp, 128
-; RV32VLA-NEXT: vse64.v v8, (a0)
-; RV32VLA-NEXT: csrr a0, vlenb
-; RV32VLA-NEXT: slli a0, a0, 3
-; RV32VLA-NEXT: addi a2, sp, 64
-; RV32VLA-NEXT: add a3, a2, a0
-; RV32VLA-NEXT: vl8re64.v v8, (a3)
-; RV32VLA-NEXT: vl8re64.v v16, (a2)
-; RV32VLA-NEXT: add a0, a1, a0
-; RV32VLA-NEXT: vs8r.v v8, (a0)
-; RV32VLA-NEXT: vs8r.v v16, (a1)
-; RV32VLA-NEXT: addi sp, s0, -80
-; RV32VLA-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; RV32VLA-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; RV32VLA-NEXT: addi sp, sp, 80
-; RV32VLA-NEXT: ret
-;
-; RV64VLA-LABEL: insert_v2i64_nxv16i64_hi:
-; RV64VLA: # %bb.0:
-; RV64VLA-NEXT: addi sp, sp, -80
-; RV64VLA-NEXT: .cfi_def_cfa_offset 80
-; RV64VLA-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64VLA-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64VLA-NEXT: .cfi_offset ra, -8
-; RV64VLA-NEXT: .cfi_offset s0, -16
-; RV64VLA-NEXT: addi s0, sp, 80
-; RV64VLA-NEXT: .cfi_def_cfa s0, 0
-; RV64VLA-NEXT: csrr a2, vlenb
-; RV64VLA-NEXT: slli a2, a2, 4
-; RV64VLA-NEXT: sub sp, sp, a2
-; RV64VLA-NEXT: andi sp, sp, -64
-; RV64VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64VLA-NEXT: vle64.v v8, (a0)
-; RV64VLA-NEXT: addi a0, sp, 128
-; RV64VLA-NEXT: vse64.v v8, (a0)
-; RV64VLA-NEXT: csrr a0, vlenb
-; RV64VLA-NEXT: slli a0, a0, 3
-; RV64VLA-NEXT: addi a2, sp, 64
-; RV64VLA-NEXT: add a3, a2, a0
-; RV64VLA-NEXT: vl8re64.v v8, (a3)
-; RV64VLA-NEXT: vl8re64.v v16, (a2)
-; RV64VLA-NEXT: add a0, a1, a0
-; RV64VLA-NEXT: vs8r.v v8, (a0)
-; RV64VLA-NEXT: vs8r.v v16, (a1)
-; RV64VLA-NEXT: addi sp, s0, -80
-; RV64VLA-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64VLA-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64VLA-NEXT: addi sp, sp, 80
-; RV64VLA-NEXT: ret
-;
-; RV32VLS-LABEL: insert_v2i64_nxv16i64_hi:
-; RV32VLS: # %bb.0:
-; RV32VLS-NEXT: addi sp, sp, -80
-; RV32VLS-NEXT: .cfi_def_cfa_offset 80
-; RV32VLS-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32VLS-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32VLS-NEXT: .cfi_offset ra, -4
-; RV32VLS-NEXT: .cfi_offset s0, -8
-; RV32VLS-NEXT: addi s0, sp, 80
-; RV32VLS-NEXT: .cfi_def_cfa s0, 0
-; RV32VLS-NEXT: addi sp, sp, -256
-; RV32VLS-NEXT: andi sp, sp, -64
-; RV32VLS-NEXT: vl1re64.v v8, (a0)
-; RV32VLS-NEXT: addi a0, sp, 128
-; RV32VLS-NEXT: vs1r.v v8, (a0)
-; RV32VLS-NEXT: addi a0, sp, 64
-; RV32VLS-NEXT: addi a2, sp, 192
-; RV32VLS-NEXT: vl8re64.v v8, (a2)
-; RV32VLS-NEXT: vl8re64.v v16, (a0)
-; RV32VLS-NEXT: addi a0, a1, 128
-; RV32VLS-NEXT: vs8r.v v8, (a0)
-; RV32VLS-NEXT: vs8r.v v16, (a1)
-; RV32VLS-NEXT: addi sp, s0, -80
-; RV32VLS-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; RV32VLS-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; RV32VLS-NEXT: addi sp, sp, 80
-; RV32VLS-NEXT: ret
+; VLA-LABEL: insert_v2i64_nxv16i64_hi:
+; VLA: # %bb.0:
+; VLA-NEXT: addi sp, sp, -16
+; VLA-NEXT: .cfi_def_cfa_offset 16
+; VLA-NEXT: csrr a2, vlenb
+; VLA-NEXT: slli a2, a2, 4
+; VLA-NEXT: sub sp, sp, a2
+; VLA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; VLA-NEXT: vle64.v v8, (a0)
+; VLA-NEXT: addi a0, sp, 80
+; VLA-NEXT: vse64.v v8, (a0)
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 3
+; VLA-NEXT: addi a2, sp, 16
+; VLA-NEXT: add a3, a2, a0
+; VLA-NEXT: vl8re64.v v8, (a3)
+; VLA-NEXT: vl8re64.v v16, (a2)
+; VLA-NEXT: add a0, a1, a0
+; VLA-NEXT: vs8r.v v8, (a0)
+; VLA-NEXT: vs8r.v v16, (a1)
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 4
+; VLA-NEXT: add sp, sp, a0
+; VLA-NEXT: addi sp, sp, 16
+; VLA-NEXT: ret
;
-; RV64VLS-LABEL: insert_v2i64_nxv16i64_hi:
-; RV64VLS: # %bb.0:
-; RV64VLS-NEXT: addi sp, sp, -80
-; RV64VLS-NEXT: .cfi_def_cfa_offset 80
-; RV64VLS-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64VLS-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64VLS-NEXT: .cfi_offset ra, -8
-; RV64VLS-NEXT: .cfi_offset s0, -16
-; RV64VLS-NEXT: addi s0, sp, 80
-; RV64VLS-NEXT: .cfi_def_cfa s0, 0
-; RV64VLS-NEXT: addi sp, sp, -256
-; RV64VLS-NEXT: andi sp, sp, -64
-; RV64VLS-NEXT: vl1re64.v v8, (a0)
-; RV64VLS-NEXT: addi a0, sp, 128
-; RV64VLS-NEXT: vs1r.v v8, (a0)
-; RV64VLS-NEXT: addi a0, sp, 64
-; RV64VLS-NEXT: addi a2, sp, 192
-; RV64VLS-NEXT: vl8re64.v v8, (a2)
-; RV64VLS-NEXT: vl8re64.v v16, (a0)
-; RV64VLS-NEXT: addi a0, a1, 128
-; RV64VLS-NEXT: vs8r.v v8, (a0)
-; RV64VLS-NEXT: vs8r.v v16, (a1)
-; RV64VLS-NEXT: addi sp, s0, -80
-; RV64VLS-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64VLS-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64VLS-NEXT: addi sp, sp, 80
-; RV64VLS-NEXT: ret
+; VLS-LABEL: insert_v2i64_nxv16i64_hi:
+; VLS: # %bb.0:
+; VLS-NEXT: addi sp, sp, -16
+; VLS-NEXT: .cfi_def_cfa_offset 16
+; VLS-NEXT: addi sp, sp, -256
+; VLS-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; VLS-NEXT: vl1re64.v v8, (a0)
+; VLS-NEXT: addi a0, sp, 80
+; VLS-NEXT: vs1r.v v8, (a0)
+; VLS-NEXT: addi a0, sp, 16
+; VLS-NEXT: addi a2, sp, 144
+; VLS-NEXT: vl8re64.v v8, (a2)
+; VLS-NEXT: vl8re64.v v16, (a0)
+; VLS-NEXT: addi a0, a1, 128
+; VLS-NEXT: vs8r.v v8, (a0)
+; VLS-NEXT: vs8r.v v16, (a1)
+; VLS-NEXT: addi sp, sp, 256
+; VLS-NEXT: addi sp, sp, 16
+; VLS-NEXT: ret
%sv = load <2 x i64>, ptr %psv
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
store <vscale x 16 x i64> %v, ptr %out
@@ -894,3 +819,8 @@ define <4 x i32> @insert_extract_v8i32_v2i32_0(<2 x i32> %v) {
%2 = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %1, i64 0)
ret <4 x i32> %2
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32VLA: {{.*}}
+; RV32VLS: {{.*}}
+; RV64VLA: {{.*}}
+; RV64VLS: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 776a1e9bab6b2..1cc287dbbcd93 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -109,63 +109,24 @@ define <64 x i32> @insertelt_v64i32_63(<64 x i32> %a, i32 %y) {
}
define <64 x i32> @insertelt_v64i32_idx(<64 x i32> %a, i32 %y, i32 zeroext %idx) {
-; RV32-LABEL: insertelt_v64i32_idx:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -384
-; RV32-NEXT: .cfi_def_cfa_offset 384
-; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 384
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -128
-; RV32-NEXT: andi a1, a1, 63
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: mv a2, sp
-; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: addi a3, sp, 128
-; RV32-NEXT: li a4, 32
-; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v16, (a3)
-; RV32-NEXT: vse32.v v8, (a2)
-; RV32-NEXT: sw a0, 0(a1)
-; RV32-NEXT: vle32.v v8, (a2)
-; RV32-NEXT: vle32.v v16, (a3)
-; RV32-NEXT: addi sp, s0, -384
-; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 384
-; RV32-NEXT: ret
-;
-; RV64-LABEL: insertelt_v64i32_idx:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -384
-; RV64-NEXT: .cfi_def_cfa_offset 384
-; RV64-NEXT: sd ra, 376(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 384
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -128
-; RV64-NEXT: andi a1, a1, 63
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: mv a2, sp
-; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: addi a3, sp, 128
-; RV64-NEXT: li a4, 32
-; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v16, (a3)
-; RV64-NEXT: vse32.v v8, (a2)
-; RV64-NEXT: sw a0, 0(a1)
-; RV64-NEXT: vle32.v v8, (a2)
-; RV64-NEXT: vle32.v v16, (a3)
-; RV64-NEXT: addi sp, s0, -384
-; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 384
-; RV64-NEXT: ret
+; CHECK-LABEL: insertelt_v64i32_idx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -256
+; CHECK-NEXT: .cfi_def_cfa_offset 256
+; CHECK-NEXT: andi a1, a1, 63
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: mv a2, sp
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: addi a3, sp, 128
+; CHECK-NEXT: li a4, 32
+; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; CHECK-NEXT: vse32.v v16, (a3)
+; CHECK-NEXT: vse32.v v8, (a2)
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: vle32.v v8, (a2)
+; CHECK-NEXT: vle32.v v16, (a3)
+; CHECK-NEXT: addi sp, sp, 256
+; CHECK-NEXT: ret
%b = insertelement <64 x i32> %a, i32 %y, i32 %idx
ret <64 x i32> %b
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
index 96094eea631ba..db2912f465d37 100644
--- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -1668,77 +1668,35 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
}
define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
-; RV32-LABEL: reverse_nxv12i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -80
-; RV32-NEXT: .cfi_def_cfa_offset 80
-; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 80
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: addi a1, a0, -1
-; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a1
-; RV32-NEXT: vrgather.vv v0, v16, v24
-; RV32-NEXT: vmv4r.v v16, v4
-; RV32-NEXT: vrgather.vv v0, v8, v24
-; RV32-NEXT: vmv4r.v v20, v0
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: addi a1, sp, 64
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: vs4r.v v4, (a0)
-; RV32-NEXT: vs8r.v v16, (a1)
-; RV32-NEXT: vl8re64.v v16, (a0)
-; RV32-NEXT: vl8re64.v v8, (a1)
-; RV32-NEXT: addi sp, s0, -80
-; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 80
-; RV32-NEXT: ret
-;
-; RV64-LABEL: reverse_nxv12i64:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -80
-; RV64-NEXT: .cfi_def_cfa_offset 80
-; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 80
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
-; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: addi a1, a0, -1
-; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a1
-; RV64-NEXT: vrgather.vv v0, v16, v24
-; RV64-NEXT: vmv4r.v v16, v4
-; RV64-NEXT: vrgather.vv v0, v8, v24
-; RV64-NEXT: vmv4r.v v20, v0
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: addi a1, sp, 64
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: vs4r.v v4, (a0)
-; RV64-NEXT: vs8r.v v16, (a1)
-; RV64-NEXT: vl8re64.v v16, (a0)
-; RV64-NEXT: vl8re64.v v8, (a1)
-; RV64-NEXT: addi sp, s0, -80
-; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 80
-; RV64-NEXT: ret
+; CHECK-LABEL: reverse_nxv12i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; CHECK-NEXT: vid.v v24
+; CHECK-NEXT: vrsub.vx v24, v24, a1
+; CHECK-NEXT: vrgather.vv v0, v16, v24
+; CHECK-NEXT: vmv4r.v v16, v4
+; CHECK-NEXT: vrgather.vv v0, v8, v24
+; CHECK-NEXT: vmv4r.v v20, v0
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: vs4r.v v4, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vl8re64.v v16, (a0)
+; CHECK-NEXT: vl8re64.v v8, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%res = call <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64> %a)
ret <vscale x 12 x i64> %res
}
@@ -1789,3 +1747,6 @@ declare <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double>
declare <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64>)
declare <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64>)
declare <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
index b6e0d1e2ff4ae..fbfcdbcd5afab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
@@ -11,19 +11,13 @@ define i1 @foo(<vscale x 16 x i8> %x, i64 %y) {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT: addi sp, sp, -80
-; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset ra, -8
-; CHECK-NEXT: .cfi_offset s0, -16
-; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: .cfi_def_cfa s0, 0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: addi a2, sp, 64
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: add a0, a2, a0
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a2, a1
@@ -36,10 +30,10 @@ define i1 @foo(<vscale x 16 x i8> %x, i64 %y) {
; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: lbu a0, 0(a0)
-; CHECK-NEXT: addi sp, s0, -80
-; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 80
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%a = bitcast <vscale x 16 x i8> %x to <vscale x 128 x i1>
%b = extractelement <vscale x 128 x i1> %a, i64 %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index d0f2ce1ca8004..7fadec6790410 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -495,19 +495,13 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB32_2:
-; CHECK-NEXT: addi sp, sp, -80
-; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset ra, -8
-; CHECK-NEXT: .cfi_offset s0, -16
-; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: .cfi_def_cfa s0, 0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: slli a3, a3, 4
; CHECK-NEXT: sub sp, sp, a3
-; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: addi a3, sp, 64
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: addi a3, sp, 16
; CHECK-NEXT: add a4, a0, a3
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: li a5, -1
@@ -524,10 +518,10 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
; CHECK-NEXT: vle8.v v16, (a1)
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a3)
-; CHECK-NEXT: addi sp, s0, -80
-; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 80
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%dst = call <vscale x 128 x i8> @llvm.experimental.vp.reverse.nxv128i8(<vscale x 128 x i8> %src, <vscale x 128 x i1> splat (i1 1), i32 %evl)
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 2a77d0238721c..4e5a1b2651ec5 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1753,44 +1753,41 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) n
define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) nounwind {
; KNL-LABEL: test_insertelement_variable_v96i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $192, %rsp
-; KNL-NEXT: movl 744(%rbp), %eax
+; KNL-NEXT: pushq %rax
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
; KNL-NEXT: andl $127, %eax
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm1
@@ -1800,91 +1797,91 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n
; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm2
; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm2
; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; KNL-NEXT: cmpb $0, 736(%rbp)
-; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa64 %zmm1, (%rsp)
-; KNL-NEXT: setne (%rsp,%rax)
-; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
+; KNL-NEXT: cmpb $0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: setne -128(%rsp,%rax)
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1892,21 +1889,21 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
@@ -1914,49 +1911,45 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rsi, %rdx
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: popq %rcx
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v96i1:
; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $192, %rsp
+; SKX-NEXT: pushq %rax
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: vmovd %edi, %xmm1
; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
@@ -1964,85 +1957,84 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n
; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
-; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
-; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; SKX-NEXT: movl 744(%rbp), %eax
+; SKX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; SKX-NEXT: andl $127, %eax
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
-; SKX-NEXT: cmpb $0, 736(%rbp)
+; SKX-NEXT: cmpb $0, {{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k1, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
-; SKX-NEXT: setne (%rsp,%rax)
-; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: setne -128(%rsp,%rax)
+; SKX-NEXT: vpsllw $7, -{{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
-; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpsllw $7, -{{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
+; SKX-NEXT: popq %rcx
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <96 x i8> %a, zeroinitializer
@@ -2055,10 +2047,7 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n
define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) nounwind {
; KNL-LABEL: test_insertelement_variable_v128i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $192, %rsp
+; KNL-NEXT: pushq %rax
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
@@ -2073,24 +2062,24 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: andl $127, %esi
; KNL-NEXT: testb %dil, %dil
-; KNL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa64 %zmm0, (%rsp)
-; KNL-NEXT: setne (%rsp,%rsi)
-; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
+; KNL-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: setne -128(%rsp,%rsi)
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -2098,21 +2087,21 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpmovsxbd -{{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
@@ -2120,35 +2109,30 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rsi, %rdx
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: popq %rcx
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v128i1:
; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $192, %rsp
+; SKX-NEXT: pushq %rax
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
; SKX-NEXT: andl $127, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k1, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
-; SKX-NEXT: setne (%rsp,%rsi)
-; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: setne -128(%rsp,%rsi)
+; SKX-NEXT: vpsllw $7, -{{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
-; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpsllw $7, -{{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
+; SKX-NEXT: popq %rcx
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <128 x i8> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index f4eb5b952ae43..42b639d4815a9 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1385,32 +1385,24 @@ define half @extract_f16_8(<32 x half> %x, i64 %idx) nounwind {
define half @extract_f16_9(<64 x half> %x, i64 %idx) nounwind {
; X64-LABEL: extract_f16_9:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: andq $-64, %rsp
-; X64-NEXT: subq $192, %rsp
+; X64-NEXT: pushq %rax
; X64-NEXT: andl $63, %edi
-; X64-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
-; X64-NEXT: vmovaps %zmm0, (%rsp)
+; X64-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: movq %rbp, %rsp
-; X64-NEXT: popq %rbp
+; X64-NEXT: popq %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: extract_f16_9:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-64, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $63, %eax
-; X86-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: vmovaps %zmm0, (%esp)
+; X86-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: vmovups %zmm0, (%esp)
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: addl $128, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%res = extractelement <64 x half> %x, i64 %idx
diff --git a/llvm/test/CodeGen/X86/gep-expanded-vector.ll b/llvm/test/CodeGen/X86/gep-expanded-vector.ll
index 943cd3610c9d3..98bde25bb6177 100644
--- a/llvm/test/CodeGen/X86/gep-expanded-vector.ll
+++ b/llvm/test/CodeGen/X86/gep-expanded-vector.ll
@@ -6,10 +6,7 @@
define ptr @malloc_init_state(<64 x ptr> %tmp, i32 %ind) nounwind {
; CHECK-LABEL: malloc_init_state:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: andq $-64, %rsp
-; CHECK-NEXT: subq $576, %rsp # imm = 0x240
+; CHECK-NEXT: subq $392, %rsp # imm = 0x188
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm8 = [16,16,16,16,16,16,16,16]
; CHECK-NEXT: vpaddq %zmm8, %zmm0, %zmm0
@@ -20,18 +17,17 @@ define ptr @malloc_init_state(<64 x ptr> %tmp, i32 %ind) nounwind {
; CHECK-NEXT: vpaddq %zmm8, %zmm5, %zmm5
; CHECK-NEXT: vpaddq %zmm8, %zmm6, %zmm6
; CHECK-NEXT: vpaddq %zmm8, %zmm7, %zmm7
-; CHECK-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm7, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm6, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm5, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm4, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm3, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $63, %edi
-; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
-; CHECK-NEXT: movq %rbp, %rsp
-; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: movq -128(%rsp,%rdi,8), %rax
+; CHECK-NEXT: addq $392, %rsp # imm = 0x188
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll
index 4cdb079d43993..0c8d102066a1a 100644
--- a/llvm/test/CodeGen/X86/i64-mem-copy.ll
+++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll
@@ -123,42 +123,34 @@ define void @PR23476(<5 x i64> %in, ptr %out, i32 %index) nounwind {
;
; X86-LABEL: PR23476:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movl 52(%ebp), %eax
+; X86-NEXT: subl $64, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $7, %eax
-; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movups 8(%ebp), %xmm1
-; X86-NEXT: movups 24(%ebp), %xmm2
-; X86-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movaps %xmm1, (%esp)
-; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1
+; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm2
+; X86-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
+; X86-NEXT: movups %xmm1, (%esp)
+; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movsd %xmm0, (%ecx)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: addl $64, %esp
; X86-NEXT: retl
;
; X86AVX-LABEL: PR23476:
; X86AVX: # %bb.0:
-; X86AVX-NEXT: pushl %ebp
-; X86AVX-NEXT: movl %esp, %ebp
-; X86AVX-NEXT: andl $-32, %esp
-; X86AVX-NEXT: subl $96, %esp
+; X86AVX-NEXT: subl $64, %esp
; X86AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86AVX-NEXT: movl 52(%ebp), %eax
+; X86AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86AVX-NEXT: andl $7, %eax
-; X86AVX-NEXT: movl 48(%ebp), %ecx
-; X86AVX-NEXT: vmovups 8(%ebp), %ymm1
-; X86AVX-NEXT: vmovaps %ymm1, (%esp)
-; X86AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X86AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86AVX-NEXT: vmovups {{[0-9]+}}(%esp), %ymm1
+; X86AVX-NEXT: vmovups %ymm1, (%esp)
+; X86AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
; X86AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86AVX-NEXT: vmovsd %xmm0, (%ecx)
-; X86AVX-NEXT: movl %ebp, %esp
-; X86AVX-NEXT: popl %ebp
+; X86AVX-NEXT: addl $64, %esp
; X86AVX-NEXT: vzeroupper
; X86AVX-NEXT: retl
%ext = extractelement <5 x i64> %in, i32 %index
diff --git a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
index 52d0c2b509128..629f44b52bc05 100644
--- a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
+++ b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
@@ -17,4 +17,22 @@ entry:
ret i32 %b
}
+define i32 @foo2(i32 %arg1) #1 {
+; CHECK-LABEL: foo2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $31, %edi
+; CHECK-NEXT: movzwl -72(%rsp,%rdi,2), %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %a = extractelement <32 x i16> zeroinitializer, i32 %arg1
+ %b = zext i16 %a to i32
+ ret i32 %b
+}
+
attributes #0 = { "no-realign-stack" "target-cpu"="skylake-avx512" }
+attributes #1 = { "no-realign-stack" "target-cpu"="skylake" }
diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
index 889ab6a0818e2..d7fe2376ed772 100644
--- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
@@ -15,28 +15,28 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X86-SSE-NEXT: movdqa 72(%ebp), %xmm0
; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE-NEXT: xorps %xmm1, %xmm1
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movaps %xmm1, (%esp)
-; X86-SSE-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp)
; X86-SSE-NEXT: leal (%ecx,%ecx), %eax
; X86-SSE-NEXT: andl $31, %eax
-; X86-SSE-NEXT: movl 128(%esp,%eax,4), %eax
+; X86-SSE-NEXT: movl 136(%esp,%eax,4), %eax
; X86-SSE-NEXT: leal 1(%ecx,%ecx), %ecx
; X86-SSE-NEXT: andl $31, %ecx
-; X86-SSE-NEXT: movl (%esp,%ecx,4), %edx
+; X86-SSE-NEXT: movl 8(%esp,%ecx,4), %edx
; X86-SSE-NEXT: movl %ebp, %esp
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
@@ -69,20 +69,20 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X86-AVX-NEXT: movl 40(%ebp), %ecx
; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %ymm1, (%esp)
-; X86-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: leal (%ecx,%ecx), %eax
; X86-AVX-NEXT: andl $31, %eax
-; X86-AVX-NEXT: movl 128(%esp,%eax,4), %eax
+; X86-AVX-NEXT: movl 152(%esp,%eax,4), %eax
; X86-AVX-NEXT: leal 1(%ecx,%ecx), %ecx
; X86-AVX-NEXT: andl $31, %ecx
-; X86-AVX-NEXT: movl (%esp,%ecx,4), %edx
+; X86-AVX-NEXT: movl 24(%esp,%ecx,4), %edx
; X86-AVX-NEXT: movl %ebp, %esp
; X86-AVX-NEXT: popl %ebp
; X86-AVX-NEXT: vzeroupper
@@ -90,22 +90,18 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
;
; X64-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rbp
-; X64-AVX-NEXT: movq %rsp, %rbp
-; X64-AVX-NEXT: andq $-32, %rsp
-; X64-AVX-NEXT: subq $160, %rsp
+; X64-AVX-NEXT: pushq %rax
; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi
; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,3,3,3]
; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovaps %ymm1, (%rsp)
-; X64-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: andl $15, %edi
-; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax
-; X64-AVX-NEXT: movq %rbp, %rsp
-; X64-AVX-NEXT: popq %rbp
+; X64-AVX-NEXT: movq -128(%rsp,%rdi,8), %rax
+; X64-AVX-NEXT: popq %rcx
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
%1 = extractelement <16 x i64> %a0, i32 15
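
For readers skimming the test churn above, here is a minimal IR sketch (not part of the patch; the function name and inline attributes are illustrative, mirroring the attribute group used by the new foo2 test) of the pattern this change affects: a variable-index extractelement from a vector wider than the 16-byte stack alignment, in a function that must not realign its stack. During type legalization the element is extracted through a stack temporary, and with this change that temporary is stored with unaligned moves clamped to the stack alignment rather than forcing an over-aligned slot.

; Sketch only: a variable-index extract from a 64-byte vector with
; stack realignment disabled, so the spill slot cannot be over-aligned.
define i32 @extract_sketch(<32 x i16> %v, i32 %idx) "no-realign-stack" "target-cpu"="skylake" {
entry:
  %e = extractelement <32 x i16> %v, i32 %idx
  %z = zext i16 %e to i32
  ret i32 %z
}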
More information about the llvm-commits mailing list