[llvm] [IA] Add support for [de]interleave{3,5,7} (PR #139373)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Sat May 10 04:25:33 PDT 2025
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/139373
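The patch below only adds precommit tests; the codegen captured in the CHECK lines is the current lowering, which round-trips the vectors through a stack temporary (a whole-register store followed by a vlsegNeX.v / vssegNeX.v at the stack slot) before interleaved-access support for these factors lands. As a rough sketch of the kind of IR being exercised (types chosen for illustration only, not taken from the patch), a factor-3 deinterleave of a contiguous load looks like:

    ; Each result takes every third element of %vec:
    ; lanes {0,3,6,...}, {1,4,7,...} and {2,5,8,...} respectively.
    define { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @sketch(ptr %p) {
      %vec = load <vscale x 6 x i32>, ptr %p
      %d = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3(<vscale x 6 x i32> %vec)
      ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d
    }

Once InterleavedAccess recognizes this pattern (the goal of this PR), a load plus deinterleave3 like the one above should presumably lower to a single segment load (e.g. vlseg3e32.v) from %p rather than the stack shuffle seen in the tests.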
From c45fca82fdbd8507e48e38c310514a03f0f373da Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 10 May 2025 18:12:58 +0800
Subject: [PATCH 1/2] Precommit tests
---
.../rvv/fixed-vectors-deinterleave-load.ll | 164 ++++++
.../rvv/fixed-vectors-interleave-store.ll | 458 +++++++++++++++++
.../RISCV/rvv/vector-deinterleave-load.ll | 120 +++++
.../RISCV/rvv/vector-interleave-store.ll | 339 +++++++++++++
.../RISCV/rvv/vp-vector-interleaved-access.ll | 476 ++++++++++++++++++
5 files changed, 1557 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index e53dfc23a84bb..df2a333eecd33 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -257,6 +257,49 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p
ret {<2 x double>, <2 x double>} %res1
}
+define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: vsetivli zero, 24, e8, m2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v12, v8, 8
+; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 16
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v10
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <24 x i8>, ptr %p
+ %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
+ %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
+ %t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1
+ %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
+}
+
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
@@ -281,6 +324,127 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact
ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3
}
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: li a1, 40
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v12, v8, 24
+; CHECK-NEXT: vslidedown.vi v14, v8, 16
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v13, v8, 8
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v14, v12, a0
+; CHECK-NEXT: vmv1r.v v12, v8
+; CHECK-NEXT: vslideup.vx v12, v13, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vmv1r.v v13, v14
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v14, v8
+; CHECK-NEXT: vs4r.v v12, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <40 x i8>, ptr %p
+ %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave5(<40 x i8> %vec)
+ %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0
+ %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1
+ %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2
+ %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3
+ %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
+ %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: li a1, 56
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 40
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v8, a1
+; CHECK-NEXT: li a1, 48
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: vslidedown.vx v12, v8, a2
+; CHECK-NEXT: add a2, a0, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v14, v8, 24
+; CHECK-NEXT: vslidedown.vi v18, v8, 16
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v13, v8, 8
+; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v18, v14, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v20, v8, a1
+; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v13, a0
+; CHECK-NEXT: vslideup.vx v12, v16, a0
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv2r.v v10, v12
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <56 x i8>, ptr %p
+ %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave7(<56 x i8> %vec)
+ %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0
+ %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1
+ %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2
+ %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3
+ %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4
+ %t5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 5
+ %t6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 6
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
+ %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4
+ %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4, <8 x i8> %t5, 5
+ %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res5, <8 x i8> %t6, 6
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6
+}
+
define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) {
; CHECK-LABEL: vector_deinterleave_load_factor8:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index 26c3db6131034..e4dac215b893a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -181,6 +181,138 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %
ret void
}
+define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, ptr %p) {
+; RV32-LABEL: vector_interleave_store_factor3:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vsseg3e32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v8, (a1)
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: vl1re32.v v9, (a1)
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: vl1re32.v v10, (a1)
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: srli a0, a2, 3
+; RV32-NEXT: li a1, 6
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (s0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 32
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_store_factor3:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: .cfi_def_cfa_offset 32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV64-NEXT: vsseg3e32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v8, (a1)
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vl1re32.v v9, (a1)
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vl1re32.v v10, (a1)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: srli a0, a2, 3
+; RV64-NEXT: li a1, 6
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (s0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 3
+; RV64-NEXT: sub a0, a1, a0
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 32
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+; CHECK-LABEL: vector_interleave_store_factor3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: add a3, a1, a2
+; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a1)
+; CHECK-NEXT: vl1re32.v v8, (a1)
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vl1re32.v v12, (a2)
+; CHECK-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <12 x i32> @llvm.vector.interleave3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+ store <12 x i32> %v, ptr %p
+ ret void
+}
+
define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor4:
; CHECK: # %bb.0:
@@ -194,6 +326,332 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3
ret void
}
+define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) {
+; RV32-LABEL: vector_interleave_store_factor5:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: add a3, a1, a2
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV32-NEXT: vsseg5e32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v10, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v11, (a4)
+; RV32-NEXT: vl1re32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v12, (a4)
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: srli a0, a2, 3
+; RV32-NEXT: li a1, 10
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v8, (s0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 32
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_store_factor5:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: .cfi_def_cfa_offset 32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: add a2, a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: add a3, a1, a2
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV64-NEXT: vsseg5e32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v10, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v11, (a4)
+; RV64-NEXT: vl1re32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v12, (a4)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: srli a0, a2, 3
+; RV64-NEXT: li a1, 10
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v8, (s0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 32
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+; CHECK-LABEL: vector_interleave_store_factor5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 2
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x05, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 5 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: add a3, a1, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg5e32.v v8, (a1)
+; CHECK-NEXT: add a2, a5, a2
+; CHECK-NEXT: vl1re32.v v10, (a5)
+; CHECK-NEXT: li a5, 32
+; CHECK-NEXT: vl1re32.v v12, (a4)
+; CHECK-NEXT: vl1re32.v v14, (a3)
+; CHECK-NEXT: vl1re32.v v8, (a1)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v10, 4
+; CHECK-NEXT: vslideup.vi v8, v14, 4
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: vl1re32.v v16, (a2)
+; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v16, 16
+; CHECK-NEXT: vsetivli zero, 20, e32, m8, ta, ma
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <20 x i32> @llvm.vector.interleave5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e)
+ store <20 x i32> %v, ptr %p
+ ret void
+}
+
+define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, ptr %p) {
+; RV32-LABEL: vector_interleave_store_factor7:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 4
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: add a3, a1, a2
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV32-NEXT: vsseg7e32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v10, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v11, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v8, (a1)
+; RV32-NEXT: add a1, a4, a2
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: vl1re32.v v12, (a4)
+; RV32-NEXT: vl1re32.v v13, (a1)
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: vl1re32.v v14, (a1)
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: srli a0, a2, 3
+; RV32-NEXT: li a1, 14
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v8, (s0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 4
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 32
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_store_factor7:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: .cfi_def_cfa_offset 32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: add a3, a1, a2
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV64-NEXT: vsseg7e32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v10, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v11, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v8, (a1)
+; RV64-NEXT: add a1, a4, a2
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: vl1re32.v v12, (a4)
+; RV64-NEXT: vl1re32.v v13, (a1)
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vl1re32.v v14, (a1)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: srli a0, a2, 3
+; RV64-NEXT: li a1, 14
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v8, (s0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 4
+; RV64-NEXT: sub a0, a1, a0
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 32
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+; CHECK-LABEL: vector_interleave_store_factor7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 7 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a1, a3
+; CHECK-NEXT: add a4, a2, a3
+; CHECK-NEXT: add a5, a4, a3
+; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg7e32.v v8, (a1)
+; CHECK-NEXT: vl1re32.v v14, (a5)
+; CHECK-NEXT: add a5, a5, a3
+; CHECK-NEXT: vl1re32.v v12, (a4)
+; CHECK-NEXT: add a4, a5, a3
+; CHECK-NEXT: add a3, a4, a3
+; CHECK-NEXT: vl1re32.v v10, (a4)
+; CHECK-NEXT: vl1re32.v v8, (a5)
+; CHECK-NEXT: vl1re32.v v16, (a3)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v16, 8
+; CHECK-NEXT: vl1re32.v v16, (a1)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v14, 4
+; CHECK-NEXT: vl1re32.v v14, (a2)
+; CHECK-NEXT: vslideup.vi v16, v14, 4
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v16, v12, 8
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v16, v8, 16
+; CHECK-NEXT: vsetivli zero, 28, e32, m8, ta, ma
+; CHECK-NEXT: vse32.v v16, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <28 x i32> @llvm.vector.interleave7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g)
+ store <28 x i32> %v, ptr %p
+ ret void
+}
+
define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor8:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 582aef908964a..be8deb1319c36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -344,6 +344,42 @@ define {<vscale x 2 x ptr>, <vscale x 2 x ptr>} @vector_deinterleave_load_nxv2p0
ret {<vscale x 2 x ptr>, <vscale x 2 x ptr>} %res1
}
+define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor3(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <vscale x 24 x i8>, ptr %p
+ %d0 = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave3(<vscale x 24 x i8> %vec)
+ %t0 = extractvalue {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %d0, 0
+ %t1 = extractvalue {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %d0, 1
+ %t2 = extractvalue {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %d0, 2
+ %res0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %t0, 0
+ %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 1
+ %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
+ ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2
+}
+
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
@@ -368,6 +404,90 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3
}
+define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor5(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 2
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <vscale x 40 x i8>, ptr %p
+ %d0 = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave5(<vscale x 40 x i8> %vec)
+ %t0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 0
+ %t1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 1
+ %t2 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 2
+ %t3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 3
+ %t4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 4
+ %res0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %t0, 0
+ %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 1
+ %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
+ %res3 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2, <vscale x 8 x i8> %t3, 3
+ %res4 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3, <vscale x 8 x i8> %t4, 4
+ ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res4
+}
+
+define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <vscale x 56 x i8>, ptr %p
+ %d0 = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave7(<vscale x 56 x i8> %vec)
+ %t0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 0
+ %t1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 1
+ %t2 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 2
+ %t3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 3
+ %t4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 4
+ %t5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 5
+ %t6 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 6
+ %res0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %t0, 0
+ %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 1
+ %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
+ %res3 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2, <vscale x 8 x i8> %t3, 3
+ %res4 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3, <vscale x 8 x i8> %t4, 4
+ %res5 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res4, <vscale x 8 x i8> %t5, 5
+ %res6 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res5, <vscale x 8 x i8> %t6, 6
+ ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res6
+}
+
define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) {
; CHECK-LABEL: vector_deinterleave_load_factor8:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index b5eb312bf5e18..eeb0e9e91ed36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -239,6 +239,107 @@ define void @vector_interleave_store_nxv4p0_nxv2p0(<vscale x 2 x ptr> %a, <vscal
ret void
}
+define void @vector_interleave_store_factor3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, ptr %p) {
+; RV32-LABEL: vector_interleave_store_factor3:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vsseg3e32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v8, (a1)
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: vl1re32.v v9, (a1)
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: vl1re32.v v10, (a1)
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: srli a0, a2, 3
+; RV32-NEXT: li a1, 6
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (s0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 32
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_store_factor3:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: .cfi_def_cfa_offset 32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV64-NEXT: vsseg3e32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v8, (a1)
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vl1re32.v v9, (a1)
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vl1re32.v v10, (a1)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: srli a0, a2, 3
+; RV64-NEXT: li a1, 6
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (s0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 3
+; RV64-NEXT: sub a0, a1, a0
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 32
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c)
+ store <vscale x 6 x i32> %v, ptr %p
+ ret void
+}
+
define void @vector_interleave_store_factor4(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor4:
; CHECK: # %bb.0:
@@ -252,6 +353,244 @@ define void @vector_interleave_store_factor4(<vscale x 4 x i32> %a, <vscale x 4
ret void
}
+define void @vector_interleave_store_factor5(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, ptr %p) {
+; RV32-LABEL: vector_interleave_store_factor5:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: add a3, a1, a2
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV32-NEXT: vsseg5e32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v10, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v11, (a4)
+; RV32-NEXT: vl1re32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v12, (a4)
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: srli a0, a2, 3
+; RV32-NEXT: li a1, 10
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v8, (s0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 32
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_store_factor5:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: .cfi_def_cfa_offset 32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: add a2, a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: add a3, a1, a2
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV64-NEXT: vsseg5e32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v10, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v11, (a4)
+; RV64-NEXT: vl1re32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v12, (a4)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: srli a0, a2, 3
+; RV64-NEXT: li a1, 10
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v8, (s0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 32
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %v = call <vscale x 10 x i32> @llvm.vector.interleave5(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e)
+ store <vscale x 10 x i32> %v, ptr %p
+ ret void
+}
+
+define void @vector_interleave_store_factor7(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, ptr %p) {
+; RV32-LABEL: vector_interleave_store_factor7:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 4
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: add a3, a1, a2
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV32-NEXT: vsseg7e32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v10, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v11, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v8, (a1)
+; RV32-NEXT: add a1, a4, a2
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: vl1re32.v v12, (a4)
+; RV32-NEXT: vl1re32.v v13, (a1)
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: vl1re32.v v14, (a1)
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: srli a0, a2, 3
+; RV32-NEXT: li a1, 14
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v8, (s0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 4
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 32
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_store_factor7:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: .cfi_def_cfa_offset 32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: add a3, a1, a2
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV64-NEXT: vsseg7e32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v10, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v11, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v8, (a1)
+; RV64-NEXT: add a1, a4, a2
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: vl1re32.v v12, (a4)
+; RV64-NEXT: vl1re32.v v13, (a1)
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: vl1re32.v v14, (a1)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: srli a0, a2, 3
+; RV64-NEXT: li a1, 14
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v8, (s0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 4
+; RV64-NEXT: sub a0, a1, a0
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 32
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %v = call <vscale x 14 x i32> @llvm.vector.interleave7(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g)
+ store <vscale x 14 x i32> %v, ptr %p
+ ret void
+}
+
define void @vector_interleave_store_factor8(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, <vscale x 2 x i32> %h, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor8:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index d0f35aa8b85e9..97fae479e0cb6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -28,6 +28,68 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor2_v2(ptr %ptr, i32 %
ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
}
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_v2(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor3_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs4r.v v8, (a0)
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vlseg3e32.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor3_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs4r.v v8, (a0)
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vlseg3e32.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 3
+ %wide.masked.load = call <vscale x 6 x i32> @llvm.vp.load(ptr %ptr, <vscale x 6 x i1> splat (i1 true), i32 %rvl)
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3(<vscale x 6 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 2
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2
+}
+
define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor4_v2(ptr %ptr, i32 %evl) {
; RV32-LABEL: load_factor4_v2:
; RV32: # %bb.0:
@@ -63,6 +125,142 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2
ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
}
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor5_v2(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor5_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vlseg5e32.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor5_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vlseg5e32.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 5
+ %wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl)
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave5(<vscale x 10 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 2
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 3
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 4
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor7_v2(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor7_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a2, a2, a1
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vlseg7e32.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor7_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: subw a2, a2, a1
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: srli a2, a2, 32
+; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vlseg7e32.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 7
+ %wide.masked.load = call <vscale x 14 x i32> @llvm.vp.load(ptr %ptr, <vscale x 14 x i1> splat (i1 true), i32 %rvl)
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave7(<vscale x 14 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 2
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 3
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 4
+ %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 5
+ %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 6
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
+ %res5 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4, <vscale x 2 x i32> %t5, 5
+ %res6 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res5, <vscale x 2 x i32> %t6, 6
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res6
+}
+
define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor8_v2(ptr %ptr, i32 %evl) {
; RV32-LABEL: load_factor8_v2:
; RV32: # %bb.0:
@@ -137,6 +335,84 @@ define void @store_factor2_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, pt
ret void
}
+define void @store_factor3_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, ptr %ptr, i32 %evl) {
+; RV32-LABEL: store_factor3_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; RV32-NEXT: vsseg3e32.v v8, (a2)
+; RV32-NEXT: add a5, a2, a4
+; RV32-NEXT: vle32.v v9, (a5)
+; RV32-NEXT: vle32.v v8, (a2)
+; RV32-NEXT: srli a3, a3, 3
+; RV32-NEXT: add a2, a3, a3
+; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v8, v9, a3
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v9, (a4)
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor3_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a4, a3, 1
+; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; RV64-NEXT: vsseg3e32.v v8, (a2)
+; RV64-NEXT: add a5, a2, a4
+; RV64-NEXT: vle32.v v9, (a5)
+; RV64-NEXT: vle32.v v8, (a2)
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: srli a3, a3, 3
+; RV64-NEXT: add a6, a3, a3
+; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v8, v9, a3
+; RV64-NEXT: add a4, a5, a4
+; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; RV64-NEXT: vle32.v v9, (a4)
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 3
+ %interleaved.vec = call <vscale x 3 x i32> @llvm.vector.interleave3(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2)
+ call void @llvm.vp.store(<vscale x 3 x i32> %interleaved.vec, ptr %ptr, <vscale x 3 x i1> splat (i1 true), i32 %rvl)
+ ret void
+}
+
define void @store_factor4_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %evl) {
; RV32-LABEL: store_factor4_v2:
; RV32: # %bb.0:
@@ -165,6 +441,206 @@ define void @store_factor4_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, pt
ret void
}
+define void @store_factor5_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4, ptr %ptr, i32 %evl) {
+; RV32-LABEL: store_factor5_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a3, a2, 1
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: add a5, a2, a4
+; RV32-NEXT: add a6, a5, a4
+; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; RV32-NEXT: vsseg5e32.v v8, (a2)
+; RV32-NEXT: add a7, a6, a4
+; RV32-NEXT: vle32.v v8, (a7)
+; RV32-NEXT: vle32.v v9, (a6)
+; RV32-NEXT: srli a3, a3, 3
+; RV32-NEXT: add a6, a3, a3
+; RV32-NEXT: vle32.v v10, (a5)
+; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v9, v8, a3
+; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v8, (a2)
+; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v8, v10, a3
+; RV32-NEXT: add a4, a7, a4
+; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v10, (a4)
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor5_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a3, a2, 1
+; RV64-NEXT: add a2, a3, a2
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a4, a3, 1
+; RV64-NEXT: add a5, a2, a4
+; RV64-NEXT: add a6, a5, a4
+; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; RV64-NEXT: vsseg5e32.v v8, (a2)
+; RV64-NEXT: add a7, a6, a4
+; RV64-NEXT: vle32.v v8, (a7)
+; RV64-NEXT: vle32.v v9, (a6)
+; RV64-NEXT: srli a3, a3, 3
+; RV64-NEXT: add a6, a3, a3
+; RV64-NEXT: vle32.v v10, (a5)
+; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v9, v8, a3
+; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; RV64-NEXT: vle32.v v8, (a2)
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v8, v10, a3
+; RV64-NEXT: add a4, a7, a4
+; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; RV64-NEXT: vle32.v v10, (a4)
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 1
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 5
+ %interleaved.vec = call <vscale x 5 x i32> @llvm.vector.interleave5(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4)
+ call void @llvm.vp.store(<vscale x 5 x i32> %interleaved.vec, ptr %ptr, <vscale x 5 x i1> splat (i1 true), i32 %rvl)
+ ret void
+}
+
+define void @store_factor7_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4, <vscale x 1 x i32> %v5, <vscale x 1 x i32> %v6, ptr %ptr, i32 %evl) {
+; RV32-LABEL: store_factor7_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: srli a3, a3, 3
+; RV32-NEXT: add a5, a2, a4
+; RV32-NEXT: add a6, a5, a4
+; RV32-NEXT: add a7, a6, a4
+; RV32-NEXT: add t0, a7, a4
+; RV32-NEXT: vsetvli t1, zero, e32, mf2, ta, ma
+; RV32-NEXT: vsseg7e32.v v8, (a2)
+; RV32-NEXT: add t1, t0, a4
+; RV32-NEXT: vle32.v v8, (t1)
+; RV32-NEXT: vle32.v v10, (t0)
+; RV32-NEXT: add t0, a3, a3
+; RV32-NEXT: add a4, t1, a4
+; RV32-NEXT: vle32.v v12, (a7)
+; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v10, v8, a3
+; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v11, (a4)
+; RV32-NEXT: vle32.v v9, (a6)
+; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v9, v12, a3
+; RV32-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v12, (a5)
+; RV32-NEXT: vle32.v v8, (a2)
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a2, a2, a1
+; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v8, v12, a3
+; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor7_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a4, a3, 1
+; RV64-NEXT: srli a3, a3, 3
+; RV64-NEXT: add a5, a2, a4
+; RV64-NEXT: add a6, a5, a4
+; RV64-NEXT: add a7, a6, a4
+; RV64-NEXT: add t0, a7, a4
+; RV64-NEXT: vsetvli t1, zero, e32, mf2, ta, ma
+; RV64-NEXT: vsseg7e32.v v8, (a2)
+; RV64-NEXT: add t1, t0, a4
+; RV64-NEXT: vle32.v v8, (t1)
+; RV64-NEXT: vle32.v v10, (t0)
+; RV64-NEXT: add t0, a3, a3
+; RV64-NEXT: add a4, t1, a4
+; RV64-NEXT: vle32.v v12, (a7)
+; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v10, v8, a3
+; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; RV64-NEXT: vle32.v v11, (a4)
+; RV64-NEXT: vle32.v v9, (a6)
+; RV64-NEXT: vle32.v v13, (a5)
+; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v9, v12, a3
+; RV64-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; RV64-NEXT: vle32.v v8, (a2)
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: subw a2, a2, a1
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v8, v13, a3
+; RV64-NEXT: srli a2, a2, 32
+; RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 7
+ %interleaved.vec = call <vscale x 7 x i32> @llvm.vector.interleave7(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4, <vscale x 1 x i32> %v5, <vscale x 1 x i32> %v6)
+ call void @llvm.vp.store(<vscale x 7 x i32> %interleaved.vec, ptr %ptr, <vscale x 7 x i1> splat (i1 true), i32 %rvl)
+ ret void
+}
+
define void @store_factor8_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %evl) {
; RV32-LABEL: store_factor8_v2:
; RV32: # %bb.0:
>From a6b4b02440dff9d680f4662fae49355fbb69669f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 10 May 2025 18:39:14 +0800
Subject: [PATCH 2/2] [IA] Support [de]interleave{3,5,7}
---
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 75 ++-
.../rvv/fixed-vectors-deinterleave-load.ll | 107 +----
.../rvv/fixed-vectors-interleave-store.ll | 437 +-----------------
.../RISCV/rvv/vector-deinterleave-load.ll | 57 ---
.../RISCV/rvv/vector-interleave-store.ll | 336 +-------------
.../RISCV/rvv/vp-vector-interleaved-access.ll | 416 ++++-------------
6 files changed, 168 insertions(+), 1260 deletions(-)
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 04d89d61cb6a9..c590e470fa779 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -571,6 +571,25 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
return true;
}
+static unsigned getIntrinsicFactor(const IntrinsicInst *II) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_deinterleave2:
+ case Intrinsic::vector_interleave2:
+ return 2;
+ case Intrinsic::vector_deinterleave3:
+ case Intrinsic::vector_interleave3:
+ return 3;
+ case Intrinsic::vector_deinterleave5:
+ case Intrinsic::vector_interleave5:
+ return 5;
+ case Intrinsic::vector_deinterleave7:
+ case Intrinsic::vector_interleave7:
+ return 7;
+ default:
+ llvm_unreachable("Unexpected intrinsic");
+ }
+}
+
// For an (de)interleave tree like this:
//
// A C B D
@@ -586,7 +605,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
// to reorder them by interleaving these values.
static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
unsigned NumLeaves = SubLeaves.size();
- if (NumLeaves == 2)
+ if (NumLeaves == 2 || !isPowerOf2_64(NumLeaves))
return;
assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
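
(Not part of the patch; a minimal sketch for reference, with illustrative names and types.) The early return above now also covers non-power-of-two leaf counts: a factor of 3, 5 or 7 comes from a single intrinsic whose operands are already in field order (see the factor-3 sketch after the next hunk), whereas a power-of-two factor is built as a tree of interleave2 whose collected leaves are out of field order, as in:

define <vscale x 4 x i32> @factor4_tree(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c, <vscale x 1 x i32> %d) {
  ; Matches the A C B D diagram above: the traversal collects the leaves as
  ; %a, %c, %b, %d, while the element order of %v is a0 b0 c0 d0 ..., so
  ; interleaveLeafValues still has to permute the collected leaves here.
  %lo = call <vscale x 2 x i32> @llvm.vector.interleave2(<vscale x 1 x i32> %a, <vscale x 1 x i32> %c)
  %hi = call <vscale x 2 x i32> @llvm.vector.interleave2(<vscale x 1 x i32> %b, <vscale x 1 x i32> %d)
  %v = call <vscale x 4 x i32> @llvm.vector.interleave2(<vscale x 2 x i32> %lo, <vscale x 2 x i32> %hi)
  ret <vscale x 4 x i32> %v
}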
@@ -608,7 +627,10 @@ static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
static bool
getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
SmallVectorImpl<Instruction *> &DeadInsts) {
- assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
+ assert(II->getIntrinsicID() == Intrinsic::vector_interleave2 ||
+ II->getIntrinsicID() == Intrinsic::vector_interleave3 ||
+ II->getIntrinsicID() == Intrinsic::vector_interleave5 ||
+ II->getIntrinsicID() == Intrinsic::vector_interleave7);
// Visit with BFS
SmallVector<IntrinsicInst *, 8> Queue;
@@ -620,7 +642,7 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
// All the intermediate intrinsics will be deleted.
DeadInsts.push_back(Current);
- for (unsigned I = 0; I < 2; ++I) {
+ for (unsigned I = 0; I < getIntrinsicFactor(Current); ++I) {
Value *Op = Current->getOperand(I);
if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
@@ -638,9 +660,10 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
}
const unsigned Factor = Operands.size();
- // Currently we only recognize power-of-two factors.
+ // Currently we only recognize factors of 2, 3, 5 and 7.
// FIXME: should we assert here instead?
- if (Factor <= 1 || !isPowerOf2_32(Factor))
+ if (Factor <= 1 ||
+ (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
return false;
interleaveLeafValues(Operands);
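
For reference, a minimal store-side pattern that this function now collects with Factor == 3 (a sketch mirroring the vector_interleave_store_factor3 test updated later in this patch; the function name is illustrative):

define void @interleave3_store(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, ptr %p) {
  ; A single interleave3 feeding a plain store: Operands becomes [%a, %b, %c]
  ; with Factor == 3, and the target hook can then emit a segment store
  ; (vsseg3e32 in the updated RISC-V checks).
  %v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c)
  store <vscale x 6 x i32> %v, ptr %p
  ret void
}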
@@ -651,9 +674,12 @@ static bool
getVectorDeinterleaveFactor(IntrinsicInst *II,
SmallVectorImpl<Value *> &Results,
SmallVectorImpl<Instruction *> &DeadInsts) {
- assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
+ assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2 ||
+ II->getIntrinsicID() == Intrinsic::vector_deinterleave3 ||
+ II->getIntrinsicID() == Intrinsic::vector_deinterleave5 ||
+ II->getIntrinsicID() == Intrinsic::vector_deinterleave7);
using namespace PatternMatch;
- if (!II->hasNUses(2))
+ if (!II->hasNUses(getIntrinsicFactor(II)))
return false;
// Visit with BFS
@@ -662,12 +688,12 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
while (!Queue.empty()) {
IntrinsicInst *Current = Queue.front();
Queue.erase(Queue.begin());
- assert(Current->hasNUses(2));
+ assert(Current->hasNUses(getIntrinsicFactor(Current)));
// All the intermediate intrinsics will be deleted from the bottom-up.
DeadInsts.insert(DeadInsts.begin(), Current);
- ExtractValueInst *LHS = nullptr, *RHS = nullptr;
+ SmallVector<ExtractValueInst *> EVs(getIntrinsicFactor(Current), nullptr);
for (User *Usr : Current->users()) {
if (!isa<ExtractValueInst>(Usr))
return 0;
@@ -679,17 +705,15 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
if (Indices.size() != 1)
return false;
- if (Indices[0] == 0 && !LHS)
- LHS = EV;
- else if (Indices[0] == 1 && !RHS)
- RHS = EV;
+ if (!EVs[Indices[0]])
+ EVs[Indices[0]] = EV;
else
return false;
}
// We have legal indices. At this point we're either going
// to continue the traversal or push the leaf values into Results.
- for (ExtractValueInst *EV : {LHS, RHS}) {
+ for (ExtractValueInst *EV : EVs) {
// Continue the traversal. We're playing safe here and matching only the
// expression consisting of a perfectly balanced binary tree in which all
// intermediate values are only used once.
@@ -713,9 +737,10 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
}
const unsigned Factor = Results.size();
- // Currently we only recognize power-of-two factors.
+ // Currently we only recognize factors of 2, 3, 5 and 7.
// FIXME: should we assert here instead?
- if (Factor <= 1 || !isPowerOf2_32(Factor))
+ if (Factor <= 1 ||
+ (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
return 0;
interleaveLeafValues(Results);
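
Likewise on the load side, a sketch of the shape this function now accepts for Factor == 3, mirroring the vector_deinterleave_load_factor3 test below (one extractvalue per field, each with a single index; names are illustrative):

define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @deinterleave3_load(ptr %p) {
  ; The deinterleave3 has exactly three extractvalue users, one per field, so
  ; EVs is filled from the indices and Results ends up as [field0, field1,
  ; field2]; the RISC-V hook then emits a segment load (vlseg3e8 in the
  ; updated checks).
  %vec = load <vscale x 24 x i8>, ptr %p
  %d = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3(<vscale x 24 x i8> %vec)
  %f0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d, 0
  %f1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d, 1
  %f2 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d, 2
  %r0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %f0, 0
  %r1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %r0, <vscale x 8 x i8> %f1, 1
  %r2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %r1, <vscale x 8 x i8> %f2, 2
  ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %r2
}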
@@ -878,11 +903,23 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
// At present, we only have intrinsics to represent (de)interleaving
- // with a factor of 2.
- if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
+ // with a factor of 2, 3, 5 and 7.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_deinterleave2:
+ case Intrinsic::vector_deinterleave3:
+ case Intrinsic::vector_deinterleave5:
+ case Intrinsic::vector_deinterleave7:
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+ break;
+ case Intrinsic::vector_interleave2:
+ case Intrinsic::vector_interleave3:
+ case Intrinsic::vector_interleave5:
+ case Intrinsic::vector_interleave7:
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
+ break;
+ default:
+ break;
+ }
}
}
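
The same dispatch also feeds the EVL-predicated patterns exercised by the updated vp-vector-interleaved-access.ll tests, where the wide VP access uses the per-field EVL scaled by the factor. A reduced sketch of that shape (illustrative names; it mirrors store_factor3_v2 from the precommit tests):

define void @vp_interleave3_store(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c, ptr %p, i32 %evl) {
  ; The wide VP store covers 3 * %evl elements, i.e. %evl elements per field.
  %rvl = mul i32 %evl, 3
  %v = call <vscale x 3 x i32> @llvm.vector.interleave3(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
  call void @llvm.vp.store(<vscale x 3 x i32> %v, ptr %p, <vscale x 3 x i1> splat (i1 true), i32 %rvl)
  ret void
}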
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index df2a333eecd33..31529b1783651 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -260,34 +260,8 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p
define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor3:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; CHECK-NEXT: vsetivli zero, 24, e8, m2, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 1
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v12, v8, 8
-; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 16
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v12, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv1r.v v9, v10
-; CHECK-NEXT: vs2r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlseg3e8.v v6, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <24 x i8>, ptr %p
%d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
@@ -327,42 +301,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor5:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: li a1, 40
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 1
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v12, v8, 24
-; CHECK-NEXT: vslidedown.vi v14, v8, 16
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v13, v8, 8
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v14, v12, a0
-; CHECK-NEXT: vmv1r.v v12, v8
-; CHECK-NEXT: vslideup.vx v12, v13, a0
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-NEXT: vmv1r.v v13, v14
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv2r.v v14, v8
-; CHECK-NEXT: vs4r.v v12, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlseg5e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <40 x i8>, ptr %p
%d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave5(<40 x i8> %vec)
@@ -382,49 +322,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor7:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: li a1, 56
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
-; CHECK-NEXT: li a2, 32
-; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a1
-; CHECK-NEXT: li a1, 48
-; CHECK-NEXT: srli a0, a0, 1
-; CHECK-NEXT: vslidedown.vx v12, v8, a2
-; CHECK-NEXT: add a2, a0, a0
-; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v14, v8, 24
-; CHECK-NEXT: vslidedown.vi v18, v8, 16
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v13, v8, 8
-; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v18, v14, a0
-; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v20, v8, a1
-; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v13, a0
-; CHECK-NEXT: vslideup.vx v12, v16, a0
-; CHECK-NEXT: vmv1r.v v9, v18
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv1r.v v13, v20
-; CHECK-NEXT: vmv2r.v v10, v12
-; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlseg7e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <56 x i8>, ptr %p
%d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave7(<56 x i8> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index e4dac215b893a..8244db45a7ef2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -182,131 +182,10 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %
}
define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, ptr %p) {
-; RV32-LABEL: vector_interleave_store_factor3:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 3
-; RV32-NEXT: sub a1, a2, a1
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV32-NEXT: vsseg3e32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v8, (a1)
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: vl1re32.v v9, (a1)
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: vl1re32.v v10, (a1)
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV32-NEXT: mv s0, a0
-; RV32-NEXT: srli a0, a2, 3
-; RV32-NEXT: li a1, 6
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v8, (s0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 3
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 32
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_interleave_store_factor3:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: .cfi_def_cfa_offset 32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 3
-; RV64-NEXT: sub a1, a2, a1
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vsseg3e32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v8, (a1)
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vl1re32.v v9, (a1)
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vl1re32.v v10, (a1)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: srli a0, a2, 3
-; RV64-NEXT: li a1, 6
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; RV64-NEXT: vse32.v v8, (s0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 3
-; RV64-NEXT: sub a0, a1, a0
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 32
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
; CHECK-LABEL: vector_interleave_store_factor3:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
-; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: add a3, a1, a2
-; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg3e32.v v8, (a1)
-; CHECK-NEXT: vl1re32.v v8, (a1)
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: vl1re32.v v10, (a3)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
-; CHECK-NEXT: vl1re32.v v12, (a2)
-; CHECK-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 8
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <12 x i32> @llvm.vector.interleave3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
store <12 x i32> %v, ptr %p
@@ -327,160 +206,10 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3
}
define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) {
-; RV32-LABEL: vector_interleave_store_factor5:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: mv a2, a1
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: add a2, a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: add a3, a1, a2
-; RV32-NEXT: add a4, a3, a2
-; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV32-NEXT: vsseg5e32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v10, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v11, (a4)
-; RV32-NEXT: vl1re32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v12, (a4)
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: mv s0, a0
-; RV32-NEXT: srli a0, a2, 3
-; RV32-NEXT: li a1, 10
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v8, (s0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 32
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_interleave_store_factor5:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: .cfi_def_cfa_offset 32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: add a3, a1, a2
-; RV64-NEXT: add a4, a3, a2
-; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV64-NEXT: vsseg5e32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v10, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v11, (a4)
-; RV64-NEXT: vl1re32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v12, (a4)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: srli a0, a2, 3
-; RV64-NEXT: li a1, 10
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v8, (s0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 32
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
; CHECK-LABEL: vector_interleave_store_factor5:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 2
-; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x05, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 5 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: add a3, a1, a2
-; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: add a5, a4, a2
-; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg5e32.v v8, (a1)
-; CHECK-NEXT: add a2, a5, a2
-; CHECK-NEXT: vl1re32.v v10, (a5)
-; CHECK-NEXT: li a5, 32
-; CHECK-NEXT: vl1re32.v v12, (a4)
-; CHECK-NEXT: vl1re32.v v14, (a3)
-; CHECK-NEXT: vl1re32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v12, v10, 4
-; CHECK-NEXT: vslideup.vi v8, v14, 4
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 8
-; CHECK-NEXT: vl1re32.v v16, (a2)
-; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: vsetivli zero, 20, e32, m8, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg5e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <20 x i32> @llvm.vector.interleave5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e)
store <20 x i32> %v, ptr %p
@@ -488,164 +217,10 @@ define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i3
}
define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, ptr %p) {
-; RV32-LABEL: vector_interleave_store_factor7:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 4
-; RV32-NEXT: sub a1, a2, a1
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: add a3, a1, a2
-; RV32-NEXT: add a4, a3, a2
-; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV32-NEXT: vsseg7e32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v10, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v11, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v8, (a1)
-; RV32-NEXT: add a1, a4, a2
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: vl1re32.v v12, (a4)
-; RV32-NEXT: vl1re32.v v13, (a1)
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: vl1re32.v v14, (a1)
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: mv s0, a0
-; RV32-NEXT: srli a0, a2, 3
-; RV32-NEXT: li a1, 14
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v8, (s0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 4
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 32
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_interleave_store_factor7:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: .cfi_def_cfa_offset 32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 4
-; RV64-NEXT: sub a1, a2, a1
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: add a3, a1, a2
-; RV64-NEXT: add a4, a3, a2
-; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV64-NEXT: vsseg7e32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v10, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v11, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v8, (a1)
-; RV64-NEXT: add a1, a4, a2
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: vl1re32.v v12, (a4)
-; RV64-NEXT: vl1re32.v v13, (a1)
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vl1re32.v v14, (a1)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: srli a0, a2, 3
-; RV64-NEXT: li a1, 14
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v8, (s0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 4
-; RV64-NEXT: sub a0, a1, a0
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 32
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
; CHECK-LABEL: vector_interleave_store_factor7:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 3
-; CHECK-NEXT: sub a1, a2, a1
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 7 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: add a2, a1, a3
-; CHECK-NEXT: add a4, a2, a3
-; CHECK-NEXT: add a5, a4, a3
-; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg7e32.v v8, (a1)
-; CHECK-NEXT: vl1re32.v v14, (a5)
-; CHECK-NEXT: add a5, a5, a3
-; CHECK-NEXT: vl1re32.v v12, (a4)
-; CHECK-NEXT: add a4, a5, a3
-; CHECK-NEXT: add a3, a4, a3
-; CHECK-NEXT: vl1re32.v v10, (a4)
-; CHECK-NEXT: vl1re32.v v8, (a5)
-; CHECK-NEXT: vl1re32.v v16, (a3)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 8
-; CHECK-NEXT: vl1re32.v v16, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v12, v14, 4
-; CHECK-NEXT: vl1re32.v v14, (a2)
-; CHECK-NEXT: vslideup.vi v16, v14, 4
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v16, v12, 8
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vslideup.vi v16, v8, 16
-; CHECK-NEXT: vsetivli zero, 28, e32, m8, ta, ma
-; CHECK-NEXT: vse32.v v16, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg7e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <28 x i32> @llvm.vector.interleave7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g)
store <28 x i32> %v, ptr %p
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index be8deb1319c36..0483bbbd35b39 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -347,27 +347,8 @@ define {<vscale x 2 x ptr>, <vscale x 2 x ptr>} @vector_deinterleave_load_nxv2p0
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor3(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor3:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
-; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg3e8.v v6, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 24 x i8>, ptr %p
%d0 = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave3(<vscale x 24 x i8> %vec)
@@ -407,27 +388,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor5(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor5:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 2
-; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg5e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 40 x i8>, ptr %p
%d0 = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave5(<vscale x 40 x i8> %vec)
@@ -447,27 +409,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor7:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 3
-; CHECK-NEXT: sub a2, a2, a1
-; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg7e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 56 x i8>, ptr %p
%d0 = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave7(<vscale x 56 x i8> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index eeb0e9e91ed36..4332ca411d91b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -240,101 +240,11 @@ define void @vector_interleave_store_nxv4p0_nxv2p0(<vscale x 2 x ptr> %a, <vscal
}
define void @vector_interleave_store_factor3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, ptr %p) {
-; RV32-LABEL: vector_interleave_store_factor3:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 3
-; RV32-NEXT: sub a1, a2, a1
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV32-NEXT: vsseg3e32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v8, (a1)
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: vl1re32.v v9, (a1)
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: vl1re32.v v10, (a1)
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV32-NEXT: mv s0, a0
-; RV32-NEXT: srli a0, a2, 3
-; RV32-NEXT: li a1, 6
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v8, (s0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 3
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 32
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_interleave_store_factor3:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: .cfi_def_cfa_offset 32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 3
-; RV64-NEXT: sub a1, a2, a1
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vsseg3e32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v8, (a1)
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vl1re32.v v9, (a1)
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vl1re32.v v10, (a1)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: srli a0, a2, 3
-; RV64-NEXT: li a1, 6
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; RV64-NEXT: vse32.v v8, (s0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 3
-; RV64-NEXT: sub a0, a1, a0
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 32
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: vector_interleave_store_factor3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: ret
%v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c)
store <vscale x 6 x i32> %v, ptr %p
ret void
@@ -354,238 +264,22 @@ define void @vector_interleave_store_factor4(<vscale x 4 x i32> %a, <vscale x 4
}
define void @vector_interleave_store_factor5(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, ptr %p) {
-; RV32-LABEL: vector_interleave_store_factor5:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: mv a2, a1
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: add a2, a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: add a3, a1, a2
-; RV32-NEXT: add a4, a3, a2
-; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV32-NEXT: vsseg5e32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v10, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v11, (a4)
-; RV32-NEXT: vl1re32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v12, (a4)
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: mv s0, a0
-; RV32-NEXT: srli a0, a2, 3
-; RV32-NEXT: li a1, 10
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v8, (s0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 32
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_interleave_store_factor5:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: .cfi_def_cfa_offset 32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: add a3, a1, a2
-; RV64-NEXT: add a4, a3, a2
-; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV64-NEXT: vsseg5e32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v10, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v11, (a4)
-; RV64-NEXT: vl1re32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v12, (a4)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: srli a0, a2, 3
-; RV64-NEXT: li a1, 10
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v8, (s0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 32
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: vector_interleave_store_factor5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg5e32.v v8, (a0)
+; CHECK-NEXT: ret
%v = call <vscale x 10 x i32> @llvm.vector.interleave5(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e)
store <vscale x 10 x i32> %v, ptr %p
ret void
}
define void @vector_interleave_store_factor7(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, ptr %p) {
-; RV32-LABEL: vector_interleave_store_factor7:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 4
-; RV32-NEXT: sub a1, a2, a1
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: add a3, a1, a2
-; RV32-NEXT: add a4, a3, a2
-; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV32-NEXT: vsseg7e32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v10, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v11, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v8, (a1)
-; RV32-NEXT: add a1, a4, a2
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: vl1re32.v v12, (a4)
-; RV32-NEXT: vl1re32.v v13, (a1)
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: vl1re32.v v14, (a1)
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: mv s0, a0
-; RV32-NEXT: srli a0, a2, 3
-; RV32-NEXT: li a1, 14
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v8, (s0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 4
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 32
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_interleave_store_factor7:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: .cfi_def_cfa_offset 32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 4
-; RV64-NEXT: sub a1, a2, a1
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: add a3, a1, a2
-; RV64-NEXT: add a4, a3, a2
-; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV64-NEXT: vsseg7e32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v10, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v11, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v8, (a1)
-; RV64-NEXT: add a1, a4, a2
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: vl1re32.v v12, (a4)
-; RV64-NEXT: vl1re32.v v13, (a1)
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: vl1re32.v v14, (a1)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: srli a0, a2, 3
-; RV64-NEXT: li a1, 14
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v8, (s0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 4
-; RV64-NEXT: sub a0, a1, a0
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 32
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: vector_interleave_store_factor7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg7e32.v v8, (a0)
+; CHECK-NEXT: ret
%v = call <vscale x 14 x i32> @llvm.vector.interleave7(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g)
store <vscale x 14 x i32> %v, ptr %p
ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 97fae479e0cb6..142ee5256f9e7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -31,52 +31,28 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor2_v2(ptr %ptr, i32 %
define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_v2(ptr %ptr, i32 %evl) {
; RV32-LABEL: load_factor3_v2:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; RV32-NEXT: slli a2, a1, 1
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs4r.v v8, (a0)
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: lui a2, 699051
+; RV32-NEXT: addi a2, a2, -1365
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV32-NEXT: vlseg3e32.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: load_factor3_v2:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: lui a2, 699051
+; RV64-NEXT: addi a2, a2, -1365
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs4r.v v8, (a0)
-; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: mulhu a1, a1, a2
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV64-NEXT: vlseg3e32.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%rvl = mul i32 %evl, 3
%wide.masked.load = call <vscale x 6 x i32> @llvm.vp.load(ptr %ptr, <vscale x 6 x i1> splat (i1 true), i32 %rvl)
@@ -128,52 +104,28 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2
define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor5_v2(ptr %ptr, i32 %evl) {
; RV32-LABEL: load_factor5_v2:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: slli a2, a1, 2
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: lui a2, 838861
+; RV32-NEXT: addi a2, a2, -819
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV32-NEXT: vlseg5e32.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: load_factor5_v2:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT: slli a2, a1, 2
; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: lui a2, 838861
+; RV64-NEXT: addi a2, a2, -819
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: mulhu a1, a1, a2
+; RV64-NEXT: srli a1, a1, 34
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV64-NEXT: vlseg5e32.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%rvl = mul i32 %evl, 5
%wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl)
@@ -194,52 +146,35 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2
define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor7_v2(ptr %ptr, i32 %evl) {
; RV32-LABEL: load_factor7_v2:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: sub a2, a2, a1
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: lui a1, 149797
+; RV32-NEXT: addi a1, a1, -1755
+; RV32-NEXT: mulhu a1, a2, a1
+; RV32-NEXT: sub a2, a2, a1
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV32-NEXT: vlseg7e32.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: load_factor7_v2:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: lui a3, 149797
; RV64-NEXT: subw a2, a2, a1
-; RV64-NEXT: slli a2, a2, 32
-; RV64-NEXT: srli a2, a2, 32
-; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: addi a1, a3, -1755
+; RV64-NEXT: slli a3, a2, 32
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: mulhu a1, a3, a1
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: subw a2, a2, a1
+; RV64-NEXT: srliw a2, a2, 1
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: srli a1, a1, 2
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV64-NEXT: vlseg7e32.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%rvl = mul i32 %evl, 7
%wide.masked.load = call <vscale x 14 x i32> @llvm.vp.load(ptr %ptr, <vscale x 14 x i1> splat (i1 true), i32 %rvl)
@@ -338,74 +273,28 @@ define void @store_factor2_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, pt
define void @store_factor3_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, ptr %ptr, i32 %evl) {
; RV32-LABEL: store_factor3_v2:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: srli a4, a3, 1
-; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; RV32-NEXT: vsseg3e32.v v8, (a2)
-; RV32-NEXT: add a5, a2, a4
-; RV32-NEXT: vle32.v v9, (a5)
-; RV32-NEXT: vle32.v v8, (a2)
-; RV32-NEXT: srli a3, a3, 3
-; RV32-NEXT: add a2, a3, a3
-; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; RV32-NEXT: vslideup.vx v8, v9, a3
-; RV32-NEXT: add a4, a5, a4
-; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
-; RV32-NEXT: vle32.v v9, (a4)
; RV32-NEXT: slli a2, a1, 1
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: lui a2, 699051
+; RV32-NEXT: addi a2, a2, -1365
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vsseg3e32.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: store_factor3_v2:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: srli a4, a3, 1
-; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; RV64-NEXT: vsseg3e32.v v8, (a2)
-; RV64-NEXT: add a5, a2, a4
-; RV64-NEXT: vle32.v v9, (a5)
-; RV64-NEXT: vle32.v v8, (a2)
; RV64-NEXT: slli a2, a1, 1
-; RV64-NEXT: srli a3, a3, 3
-; RV64-NEXT: add a6, a3, a3
-; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; RV64-NEXT: vslideup.vx v8, v9, a3
-; RV64-NEXT: add a4, a5, a4
-; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
-; RV64-NEXT: vle32.v v9, (a4)
; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: lui a2, 699051
+; RV64-NEXT: addi a2, a2, -1365
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: mulhu a1, a1, a2
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vsseg3e32.v v8, (a0)
; RV64-NEXT: ret
%rvl = mul i32 %evl, 3
%interleaved.vec = call <vscale x 3 x i32> @llvm.vector.interleave3(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2)
@@ -444,92 +333,28 @@ define void @store_factor4_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, pt
define void @store_factor5_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4, ptr %ptr, i32 %evl) {
; RV32-LABEL: store_factor5_v2:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a3, a2, 1
-; RV32-NEXT: add a2, a3, a2
-; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: srli a4, a3, 1
-; RV32-NEXT: add a5, a2, a4
-; RV32-NEXT: add a6, a5, a4
-; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
-; RV32-NEXT: vsseg5e32.v v8, (a2)
-; RV32-NEXT: add a7, a6, a4
-; RV32-NEXT: vle32.v v8, (a7)
-; RV32-NEXT: vle32.v v9, (a6)
-; RV32-NEXT: srli a3, a3, 3
-; RV32-NEXT: add a6, a3, a3
-; RV32-NEXT: vle32.v v10, (a5)
-; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; RV32-NEXT: vslideup.vx v9, v8, a3
-; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; RV32-NEXT: vle32.v v8, (a2)
-; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; RV32-NEXT: vslideup.vx v8, v10, a3
-; RV32-NEXT: add a4, a7, a4
-; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
-; RV32-NEXT: vle32.v v10, (a4)
; RV32-NEXT: slli a2, a1, 2
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: lui a2, 838861
+; RV32-NEXT: addi a2, a2, -819
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vsseg5e32.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: store_factor5_v2:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 1
-; RV64-NEXT: add a2, a3, a2
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: srli a4, a3, 1
-; RV64-NEXT: add a5, a2, a4
-; RV64-NEXT: add a6, a5, a4
-; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
-; RV64-NEXT: vsseg5e32.v v8, (a2)
-; RV64-NEXT: add a7, a6, a4
-; RV64-NEXT: vle32.v v8, (a7)
-; RV64-NEXT: vle32.v v9, (a6)
-; RV64-NEXT: srli a3, a3, 3
-; RV64-NEXT: add a6, a3, a3
-; RV64-NEXT: vle32.v v10, (a5)
-; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; RV64-NEXT: vslideup.vx v9, v8, a3
-; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; RV64-NEXT: vle32.v v8, (a2)
; RV64-NEXT: slli a2, a1, 2
-; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; RV64-NEXT: vslideup.vx v8, v10, a3
-; RV64-NEXT: add a4, a7, a4
-; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
-; RV64-NEXT: vle32.v v10, (a4)
; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: lui a2, 838861
+; RV64-NEXT: addi a2, a2, -819
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 1
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: mulhu a1, a1, a2
+; RV64-NEXT: srli a1, a1, 34
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vsseg5e32.v v8, (a0)
; RV64-NEXT: ret
%rvl = mul i32 %evl, 5
%interleaved.vec = call <vscale x 5 x i32> @llvm.vector.interleave5(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4)
@@ -540,100 +365,35 @@ define void @store_factor5_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <v
define void @store_factor7_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4, <vscale x 1 x i32> %v5, <vscale x 1 x i32> %v6, ptr %ptr, i32 %evl) {
; RV32-LABEL: store_factor7_v2:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: srli a4, a3, 1
-; RV32-NEXT: srli a3, a3, 3
-; RV32-NEXT: add a5, a2, a4
-; RV32-NEXT: add a6, a5, a4
-; RV32-NEXT: add a7, a6, a4
-; RV32-NEXT: add t0, a7, a4
-; RV32-NEXT: vsetvli t1, zero, e32, mf2, ta, ma
-; RV32-NEXT: vsseg7e32.v v8, (a2)
-; RV32-NEXT: add t1, t0, a4
-; RV32-NEXT: vle32.v v8, (t1)
-; RV32-NEXT: vle32.v v10, (t0)
-; RV32-NEXT: add t0, a3, a3
-; RV32-NEXT: add a4, t1, a4
-; RV32-NEXT: vle32.v v12, (a7)
-; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma
-; RV32-NEXT: vslideup.vx v10, v8, a3
-; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
-; RV32-NEXT: vle32.v v11, (a4)
-; RV32-NEXT: vle32.v v9, (a6)
-; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma
-; RV32-NEXT: vslideup.vx v9, v12, a3
-; RV32-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
-; RV32-NEXT: vle32.v v12, (a5)
-; RV32-NEXT: vle32.v v8, (a2)
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: sub a2, a2, a1
-; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma
-; RV32-NEXT: vslideup.vx v8, v12, a3
-; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: lui a1, 149797
+; RV32-NEXT: addi a1, a1, -1755
+; RV32-NEXT: mulhu a1, a2, a1
+; RV32-NEXT: sub a2, a2, a1
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vsseg7e32.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: store_factor7_v2:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: srli a4, a3, 1
-; RV64-NEXT: srli a3, a3, 3
-; RV64-NEXT: add a5, a2, a4
-; RV64-NEXT: add a6, a5, a4
-; RV64-NEXT: add a7, a6, a4
-; RV64-NEXT: add t0, a7, a4
-; RV64-NEXT: vsetvli t1, zero, e32, mf2, ta, ma
-; RV64-NEXT: vsseg7e32.v v8, (a2)
-; RV64-NEXT: add t1, t0, a4
-; RV64-NEXT: vle32.v v8, (t1)
-; RV64-NEXT: vle32.v v10, (t0)
-; RV64-NEXT: add t0, a3, a3
-; RV64-NEXT: add a4, t1, a4
-; RV64-NEXT: vle32.v v12, (a7)
-; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma
-; RV64-NEXT: vslideup.vx v10, v8, a3
-; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
-; RV64-NEXT: vle32.v v11, (a4)
-; RV64-NEXT: vle32.v v9, (a6)
-; RV64-NEXT: vle32.v v13, (a5)
-; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma
-; RV64-NEXT: vslideup.vx v9, v12, a3
-; RV64-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
-; RV64-NEXT: vle32.v v8, (a2)
; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: lui a3, 149797
; RV64-NEXT: subw a2, a2, a1
-; RV64-NEXT: slli a2, a2, 32
-; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma
-; RV64-NEXT: vslideup.vx v8, v13, a3
-; RV64-NEXT: srli a2, a2, 32
-; RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: addi a1, a3, -1755
+; RV64-NEXT: slli a3, a2, 32
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: mulhu a1, a3, a1
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: subw a2, a2, a1
+; RV64-NEXT: srliw a2, a2, 1
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: srli a1, a1, 2
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vsseg7e32.v v8, (a0)
; RV64-NEXT: ret
%rvl = mul i32 %evl, 7
%interleaved.vec = call <vscale x 7 x i32> @llvm.vector.interleave7(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, <vscale x 1 x i32> %v2, <vscale x 1 x i32> %v3, <vscale x 1 x i32> %v4, <vscale x 1 x i32> %v5, <vscale x 1 x i32> %v6)