[llvm] [IA] Add support for [de]interleave{4,6,8} (PR #141512)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Mon May 26 11:39:40 PDT 2025
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/141512
This teaches the interleaved access pass to lower the intrinsics for factors 4, 6, and 8 added in #139893 to target intrinsics.
Because factors 4 and 8 could either have been recursively [de]interleaved or have just been a single intrinsic, we need to check that it's the former before reshuffling the values via interleaveLeafValues.
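To illustrate why (a rough sketch, not code from this patch): for factor 4 the recursive form deinterleaves twice by factor 2, so the leaf values come back in the order 0, 2, 1, 3 rather than 0, 1, 2, 3, and interleaveLeafValues has to put them back into factor order:

  ; %vec holds a0,b0,c0,d0,a1,b1,c1,d1,... interleaved at factor 4
  %outer = call { <8 x i8>, <8 x i8> } @llvm.vector.deinterleave2(<16 x i8> %vec)
  %evens = extractvalue { <8 x i8>, <8 x i8> } %outer, 0      ; a0,c0,a1,c1,...
  %odds  = extractvalue { <8 x i8>, <8 x i8> } %outer, 1      ; b0,d0,b1,d1,...
  %inner0 = call { <4 x i8>, <4 x i8> } @llvm.vector.deinterleave2(<8 x i8> %evens) ; leaves a, c
  %inner1 = call { <4 x i8>, <4 x i8> } @llvm.vector.deinterleave2(<8 x i8> %odds)  ; leaves b, d

A single @llvm.vector.deinterleave4 already returns its four results in factor order, so applying the same reshuffle to it would scramble them.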
After this patch, we can teach the loop vectorizer to emit a single interleave intrinsic for factors 2 through 8, and then we can remove the recursive interleaving matching in the interleaved access pass.
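As a concrete example of the lowering on RISC-V (a sketch based on the existing recursive factor-4 checks in the tests below; the second patch is expected to produce the same segment-load form for the new intrinsic):

  %interleaved.vec = load <vscale x 16 x i32>, ptr %ptr
  %v = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %interleaved.vec)

should become a vlseg4 tuple load plus extracts, e.g. on RV64:

  %1 = call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) poison, ptr %ptr, i64 -1, i64 5)
  %2 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1, i32 0)
  ; ...and a tuple.extract for each of the remaining three fields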
From d21307f84fe70a1cf7153d7d2d83dc16b52950eb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 26 May 2025 19:22:31 +0100
Subject: [PATCH 1/2] Precommit tests
---
.../rvv/fixed-vectors-deinterleave-load.ll | 191 +++++++++++++++-
.../rvv/fixed-vectors-interleave-store.ll | 148 +++++++++++++
.../RISCV/rvv/vector-deinterleave-load.ll | 127 ++++++++++-
.../RISCV/rvv/vector-interleave-store.ll | 207 +++++++++++++++++-
.../RISCV/interleaved-accesses.ll | 166 +++++++++++++-
5 files changed, 826 insertions(+), 13 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 31529b1783651..6b6c64b54956b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -277,6 +277,55 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 24
+; CHECK-NEXT: vslidedown.vi v12, v8, 16
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 8
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v12, v10, a0
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv.v.v v9, v12
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <32 x i8>, ptr %p
+ %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave4(<32 x i8> %vec)
+ %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0
+ %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1
+ %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2
+ %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3
+}
+
+; TODO: Remove once recursive deinterleaving support is removed
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4_recursive(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor4_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
@@ -319,6 +368,67 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave
ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4
}
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor6(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: li a1, 48
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: li a0, 40
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: srli a2, a2, 1
+; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v12, v8, a0
+; CHECK-NEXT: add a0, a2, a2
+; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v14, v8, 24
+; CHECK-NEXT: vslidedown.vi v20, v8, 16
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v13, v8, 8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v20, v14, a2
+; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v8, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v13, a2
+; CHECK-NEXT: vslideup.vx v16, v12, a2
+; CHECK-NEXT: vmv1r.v v9, v20
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v10, v16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <48 x i8>, ptr %p
+ %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave6(<48 x i8> %vec)
+ %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0
+ %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1
+ %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2
+ %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3
+ %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4
+ %t5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 5
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
+ %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4
+ %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4, <8 x i8> %t5, 5
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res5
+}
+
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor7:
; CHECK: # %bb.0:
@@ -339,14 +449,89 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
%res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
%res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4
- %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t5, 5
- %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t6, 6
+ %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4, <8 x i8> %t5, 5
+ %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res5, <8 x i8> %t6, 6
ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6
}
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) {
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor8(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor8:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: li a0, 56
+; CHECK-NEXT: li a1, 48
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v8, a0
+; CHECK-NEXT: li a0, 40
+; CHECK-NEXT: vslidedown.vx v12, v8, a1
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: srli a2, a2, 1
+; CHECK-NEXT: vslidedown.vx v20, v8, a0
+; CHECK-NEXT: add a0, a2, a2
+; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v14, v8, 24
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v12, v16, a2
+; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v22, v8, 16
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v22, v14, a2
+; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v8, a1
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a2
+; CHECK-NEXT: vslideup.vx v16, v20, a2
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v17, v12
+; CHECK-NEXT: vmv2r.v v10, v16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v15, v14
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <64 x i8>, ptr %p
+ %d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave8(<64 x i8> %vec)
+ %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0
+ %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1
+ %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2
+ %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3
+ %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4
+ %t5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 5
+ %t6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 6
+ %t7 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 7
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
+ %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4
+ %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4, <8 x i8> %t5, 5
+ %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res5, <8 x i8> %t6, 6
+  %res7 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6, <8 x i8> %t7, 7
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res7
+}
+
+; TODO: Remove once recursive deinterleaving support is removed
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8_recursive(ptr %ptr) {
+; CHECK-LABEL: vector_deinterleave_load_factor8_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index 8244db45a7ef2..6497675bd56f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -195,6 +195,45 @@ define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i3
define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor4:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: add a3, a1, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a1)
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: vl1re32.v v8, (a4)
+; CHECK-NEXT: vl1re32.v v10, (a2)
+; CHECK-NEXT: vl1re32.v v12, (a1)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: vslideup.vi v12, v10, 4
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v8, 8
+; CHECK-NEXT: vse32.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <16 x i32> @llvm.vector.interleave4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d)
+ store <16 x i32> %v, ptr %p
+ ret void
+}
+
+; TODO: Remove once recursive interleaving support is removed
+define void @vector_interleave_store_factor4_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_factor4_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsseg4e32.v v8, (a0)
; CHECK-NEXT: ret
@@ -216,6 +255,60 @@ define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i3
ret void
}
+define void @vector_interleave_store_factor6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_factor6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 6 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 32
+; CHECK-NEXT: add a4, a1, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg6e32.v v8, (a1)
+; CHECK-NEXT: vl1re32.v v12, (a5)
+; CHECK-NEXT: add a5, a5, a2
+; CHECK-NEXT: vl1re32.v v10, (a5)
+; CHECK-NEXT: add a5, a5, a2
+; CHECK-NEXT: vl1re32.v v14, (a4)
+; CHECK-NEXT: vl1re32.v v8, (a1)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v10, 4
+; CHECK-NEXT: vslideup.vi v8, v14, 4
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: vl1re32.v v16, (a5)
+; CHECK-NEXT: add a2, a5, a2
+; CHECK-NEXT: vl1re32.v v12, (a2)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v16, v12, 4
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v16, 16
+; CHECK-NEXT: vsetivli zero, 24, e32, m8, ta, ma
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <24 x i32> @llvm.vector.interleave6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f)
+ store <24 x i32> %v, ptr %p
+ ret void
+}
+
define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor7:
; CHECK: # %bb.0:
@@ -230,6 +323,61 @@ define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i3
define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor8:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a1, a3
+; CHECK-NEXT: add a4, a2, a3
+; CHECK-NEXT: add a5, a4, a3
+; CHECK-NEXT: add a6, a5, a3
+; CHECK-NEXT: add a7, a6, a3
+; CHECK-NEXT: add t0, a7, a3
+; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a1)
+; CHECK-NEXT: add a3, t0, a3
+; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: vl1re32.v v12, (t0)
+; CHECK-NEXT: vl1re32.v v14, (a7)
+; CHECK-NEXT: vl1re32.v v8, (a6)
+; CHECK-NEXT: vl1re32.v v18, (a5)
+; CHECK-NEXT: vl1re32.v v20, (a4)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v10, 4
+; CHECK-NEXT: vslideup.vi v8, v14, 4
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: vl1re32.v v16, (a1)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v20, v18, 4
+; CHECK-NEXT: vl1re32.v v12, (a2)
+; CHECK-NEXT: vslideup.vi v16, v12, 4
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v16, v20, 8
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v16, v8, 16
+; CHECK-NEXT: vse32.v v16, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <32 x i32> @llvm.vector.interleave8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h)
+ store <32 x i32> %v, ptr %p
+ ret void
+}
+
+; TODO: Remove once recursive interleaving support is removed
+define void @vector_interleave_store_factor8_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_factor8_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsseg8e32.v v8, (a0)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 0483bbbd35b39..b15ccdedda8f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -364,6 +364,41 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deint
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: vl4r.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <vscale x 32 x i8>, ptr %p
+ %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
+ %t0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 0
+ %t1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 1
+ %t2 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 2
+ %t3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 3
+ %res0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %t0, 0
+ %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 1
+ %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
+ %res3 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2, <vscale x 8 x i8> %t3, 3
+ ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3
+}
+
+; TODO: Remove once recursive deinterleaving support is removed
+define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor4_recursive(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor4_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
@@ -406,6 +441,49 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res4
}
+define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor6(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub a1, a1, a2
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <vscale x 48 x i8>, ptr %p
+ %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave6(<vscale x 48 x i8> %vec)
+ %t0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 0
+ %t1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 1
+ %t2 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 2
+ %t3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 3
+ %t4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 4
+ %t5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 5
+ %res0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %t0, 0
+ %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 1
+ %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
+ %res3 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2, <vscale x 8 x i8> %t3, 3
+ %res4 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3, <vscale x 8 x i8> %t4, 4
+ %res5 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res4, <vscale x 8 x i8> %t5, 5
+ ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res5
+}
+
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor7:
; CHECK: # %bb.0:
@@ -426,14 +504,57 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
%res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
%res3 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2, <vscale x 8 x i8> %t3, 3
%res4 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3, <vscale x 8 x i8> %t4, 4
- %res5 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3, <vscale x 8 x i8> %t5, 5
- %res6 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3, <vscale x 8 x i8> %t6, 6
+ %res5 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res4, <vscale x 8 x i8> %t5, 5
+ %res6 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res5, <vscale x 8 x i8> %t6, 6
ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res6
}
-define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) {
+define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor8(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor8:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vl8r.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = load <vscale x 64 x i8>, ptr %p
+ %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave8(<vscale x 64 x i8> %vec)
+ %t0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 0
+ %t1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 1
+ %t2 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 2
+ %t3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 3
+ %t4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 4
+ %t5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 5
+ %t6 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 6
+ %t7 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 7
+ %res0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %t0, 0
+ %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 1
+ %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
+ %res3 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2, <vscale x 8 x i8> %t3, 3
+ %res4 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res3, <vscale x 8 x i8> %t4, 4
+ %res5 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res4, <vscale x 8 x i8> %t5, 5
+ %res6 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res5, <vscale x 8 x i8> %t6, 6
+ %res7 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res6, <vscale x 8 x i8> %t7, 7
+ ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res7
+}
+
+; TODO: Remove once recursive deinterleaving support is removed
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_load_factor8_recursive(ptr %ptr) {
+; CHECK-LABEL: vector_deinterleave_load_factor8_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 4332ca411d91b..26d387baa5e22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -250,9 +250,43 @@ define void @vector_interleave_store_factor3(<vscale x 2 x i32> %a, <vscale x 2
ret void
}
-define void @vector_interleave_store_factor4(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, ptr %p) {
+define void @vector_interleave_store_factor4(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor4:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: add a3, a1, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a1)
+; CHECK-NEXT: vl1re32.v v10, (a4)
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: vl1re32.v v11, (a2)
+; CHECK-NEXT: vl1re32.v v8, (a1)
+; CHECK-NEXT: vl1re32.v v9, (a3)
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i32> @llvm.vector.interleave4(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d)
+ store <vscale x 8 x i32> %v, ptr %p
+ ret void
+}
+
+; TODO: Remove once recursive interleaving support is removed
+define void @vector_interleave_store_factor4_recursive(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_factor4_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vsseg4e32.v v8, (a0)
; CHECK-NEXT: ret
@@ -274,6 +308,135 @@ define void @vector_interleave_store_factor5(<vscale x 2 x i32> %a, <vscale x 2
ret void
}
+define void @vector_interleave_store_factor6(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, ptr %p) {
+; RV32-LABEL: vector_interleave_store_factor6:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: add a3, a1, a2
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV32-NEXT: vsseg6e32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v10, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v11, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v8, (a1)
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: vl1re32.v v12, (a4)
+; RV32-NEXT: add a4, a4, a2
+; RV32-NEXT: vl1re32.v v13, (a4)
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: srli a0, a2, 3
+; RV32-NEXT: li a1, 12
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v8, (s0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 32
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_store_factor6:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: .cfi_def_cfa_offset 32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a2, a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: add a3, a1, a2
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; RV64-NEXT: vsseg6e32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v10, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v11, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v8, (a1)
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: vl1re32.v v12, (a4)
+; RV64-NEXT: add a4, a4, a2
+; RV64-NEXT: vl1re32.v v13, (a4)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: srli a0, a2, 3
+; RV64-NEXT: li a1, 12
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v8, (s0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 32
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %v = call <vscale x 12 x i32> @llvm.vector.interleave6(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f)
+ store <vscale x 12 x i32> %v, ptr %p
+ ret void
+}
+
define void @vector_interleave_store_factor7(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor7:
; CHECK: # %bb.0:
@@ -288,6 +451,48 @@ define void @vector_interleave_store_factor7(<vscale x 2 x i32> %a, <vscale x 2
define void @vector_interleave_store_factor8(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, <vscale x 2 x i32> %h, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor8:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: add a3, a1, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: add a7, a6, a2
+; CHECK-NEXT: add t0, a7, a2
+; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a1)
+; CHECK-NEXT: vl1re32.v v14, (t0)
+; CHECK-NEXT: add a2, t0, a2
+; CHECK-NEXT: vl1re32.v v15, (a2)
+; CHECK-NEXT: vl1re32.v v12, (a6)
+; CHECK-NEXT: vl1re32.v v13, (a7)
+; CHECK-NEXT: vl1re32.v v10, (a4)
+; CHECK-NEXT: vl1re32.v v11, (a5)
+; CHECK-NEXT: vl1re32.v v8, (a1)
+; CHECK-NEXT: vl1re32.v v9, (a3)
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i32> @llvm.vector.interleave8(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, <vscale x 2 x i32> %h)
+ store <vscale x 16 x i32> %v, ptr %p
+ ret void
+}
+
+; TODO: Remove once recursive interleaving support is removed
+define void @vector_interleave_store_factor8_recursive(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, <vscale x 2 x i32> %h, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_factor8_recursive:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vsseg8e32.v v8, (a0)
; CHECK-NEXT: ret
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
index 0a20e03d0dff1..ebdeb70538d4a 100644
--- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
@@ -135,6 +135,35 @@ define void @load_factor4(ptr %ptr) {
define void @load_factor4_vscale(ptr %ptr) {
; RV32-LABEL: @load_factor4_vscale(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
+; RV32-NEXT: [[TMP9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV32-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 0
+; RV32-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 1
+; RV32-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 2
+; RV32-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 3
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor4_vscale(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
+; RV64-NEXT: [[TMP9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV64-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 0
+; RV64-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 1
+; RV64-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 2
+; RV64-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 3
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <vscale x 16 x i32>, ptr %ptr
+ %v = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %interleaved.vec)
+ %t0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %v, 0
+ %t1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %v, 1
+ %t2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %v, 2
+ %t3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %v, 3
+ ret void
+}
+
+; TODO: Remove once recursive deinterleaving support is removed
+define void @load_factor4_vscale_recursive(ptr %ptr) {
+; RV32-LABEL: @load_factor4_vscale_recursive(
; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5)
; RV32-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 0)
; RV32-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP2]], 0
@@ -150,7 +179,7 @@ define void @load_factor4_vscale(ptr %ptr) {
; RV32-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 3
; RV32-NEXT: ret void
;
-; RV64-LABEL: @load_factor4_vscale(
+; RV64-LABEL: @load_factor4_vscale_recursive(
; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5)
; RV64-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 0)
; RV64-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP2]], 0
@@ -287,6 +316,40 @@ define void @load_factor6(ptr %ptr) {
ret void
}
+define void @load_factor6_vscale(ptr %ptr) {
+; RV32-LABEL: @load_factor6_vscale(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[PTR:%.*]], align 64
+; RV32-NEXT: [[TMP13:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave6.nxv12i32(<vscale x 12 x i32> [[INTERLEAVED_VEC]])
+; RV32-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 0
+; RV32-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 1
+; RV32-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 2
+; RV32-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 3
+; RV32-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 4
+; RV32-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 5
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor6_vscale(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[PTR:%.*]], align 64
+; RV64-NEXT: [[TMP13:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave6.nxv12i32(<vscale x 12 x i32> [[INTERLEAVED_VEC]])
+; RV64-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 0
+; RV64-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 1
+; RV64-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 2
+; RV64-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 3
+; RV64-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 4
+; RV64-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 5
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <vscale x 12 x i32>, ptr %ptr
+ %v = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave6.nxv12i32(<vscale x 12 x i32> %interleaved.vec)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 2
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 3
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 4
+ %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 5
+ ret void
+}
+
define void @load_factor7(ptr %ptr) {
; RV32-LABEL: @load_factor7(
; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg7.load.mask.v4i32.p0.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4)
@@ -423,6 +486,47 @@ define void @load_factor8(ptr %ptr) {
define void @load_factor8_vscale(ptr %ptr) {
; RV32-LABEL: @load_factor8_vscale(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
+; RV32-NEXT: [[TMP17:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave8.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV32-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 0
+; RV32-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 1
+; RV32-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 2
+; RV32-NEXT: [[TMP21:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 3
+; RV32-NEXT: [[TMP22:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 4
+; RV32-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 5
+; RV32-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 6
+; RV32-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 7
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor8_vscale(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
+; RV64-NEXT: [[TMP17:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave8.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV64-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 0
+; RV64-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 1
+; RV64-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 2
+; RV64-NEXT: [[TMP21:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 3
+; RV64-NEXT: [[TMP22:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 4
+; RV64-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 5
+; RV64-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 6
+; RV64-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 7
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <vscale x 16 x i32>, ptr %ptr
+ %v = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave8.nxv16i32(<vscale x 16 x i32> %interleaved.vec)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 2
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 3
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 4
+ %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 5
+ %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 6
+ %t7 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %v, 7
+ ret void
+}
+
+; TODO: Remove once recursive deinterleaving support is removed
+define void @load_factor8_vscale_recursive(ptr %ptr) {
+; RV32-LABEL: @load_factor8_vscale_recursive(
; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5)
; RV32-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 0)
; RV32-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP2]], 0
@@ -450,7 +554,7 @@ define void @load_factor8_vscale(ptr %ptr) {
; RV32-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 7
; RV32-NEXT: ret void
;
-; RV64-LABEL: @load_factor8_vscale(
+; RV64-LABEL: @load_factor8_vscale_recursive(
; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5)
; RV64-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 0)
; RV64-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP2]], 0
@@ -616,8 +720,25 @@ define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2
ret void
}
-define void @store_factor4_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1) {
+define void @store_factor4_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3) {
; RV32-LABEL: @store_factor4_vscale(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv32i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]])
+; RV32-NEXT: store <vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @store_factor4_vscale(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv32i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]])
+; RV64-NEXT: store <vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3)
+ store <vscale x 32 x i8> %interleaved.vec, ptr %ptr
+ ret void
+}
+
+; TODO: Remove once recursive interleaving support is removed
+define void @store_factor4_vscale_recursive(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1) {
+; RV32-LABEL: @store_factor4_vscale_recursive(
; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP1]], <vscale x 8 x i8> [[V0]], i32 1)
; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP2]], <vscale x 8 x i8> [[V1:%.*]], i32 2)
@@ -625,7 +746,7 @@ define void @store_factor4_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x
; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3)
; RV32-NEXT: ret void
;
-; RV64-LABEL: @store_factor4_vscale(
+; RV64-LABEL: @store_factor4_vscale_recursive(
; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP1]], <vscale x 8 x i8> [[V0]], i32 1)
; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP2]], <vscale x 8 x i8> [[V1:%.*]], i32 2)
@@ -736,6 +857,22 @@ define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32
ret void
}
+define void @store_factor6_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5) {
+; RV32-LABEL: @store_factor6_vscale(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv48i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]])
+; RV32-NEXT: store <vscale x 48 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @store_factor6_vscale(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv48i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]])
+; RV64-NEXT: store <vscale x 48 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5)
+ store <vscale x 48 x i8> %interleaved.vec, ptr %ptr
+ ret void
+}
+
define void @store_factor7_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5, <vscale x 8 x i8> %v6) {
; RV32-LABEL: @store_factor7_vscale(
; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
@@ -764,8 +901,25 @@ define void @store_factor7_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x
ret void
}
-define void @store_factor8_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3) {
+define void @store_factor8_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5, <vscale x 8 x i8> %v6, <vscale x 8 x i8> %v7) {
; RV32-LABEL: @store_factor8_vscale(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave8.nxv64i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]], <vscale x 8 x i8> [[V6:%.*]], <vscale x 8 x i8> [[V7:%.*]])
+; RV32-NEXT: store <vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @store_factor8_vscale(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave8.nxv64i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]], <vscale x 8 x i8> [[V6:%.*]], <vscale x 8 x i8> [[V7:%.*]])
+; RV64-NEXT: store <vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = call <vscale x 64 x i8> @llvm.vector.interleave8.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5, <vscale x 8 x i8> %v6, <vscale x 8 x i8> %v7)
+ store <vscale x 64 x i8> %interleaved.vec, ptr %ptr
+ ret void
+}
+
+; TODO: Remove once recursive interleaving support is removed
+define void @store_factor8_vscale_recursive(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3) {
+; RV32-LABEL: @store_factor8_vscale_recursive(
; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], <vscale x 8 x i8> [[V2:%.*]], i32 1)
; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP2]], <vscale x 8 x i8> [[V0]], i32 2)
@@ -777,7 +931,7 @@ define void @store_factor8_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x
; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3)
; RV32-NEXT: ret void
;
-; RV64-LABEL: @store_factor8_vscale(
+; RV64-LABEL: @store_factor8_vscale_recursive(
; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], <vscale x 8 x i8> [[V2:%.*]], i32 1)
; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP2]], <vscale x 8 x i8> [[V0]], i32 2)
>From 9e9c45da341de180b8fc3358b5bd6ed125c4a099 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 26 May 2025 17:07:10 +0100
Subject: [PATCH 2/2] [IA] Add support for [de]interleave{4,6,8}
---
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 46 ++++-
.../rvv/fixed-vectors-deinterleave-load.ll | 119 +----------
.../rvv/fixed-vectors-interleave-store.ll | 125 +-----------
.../RISCV/rvv/vector-deinterleave-load.ll | 50 -----
.../RISCV/rvv/vector-interleave-store.ll | 188 +-----------------
.../RISCV/interleaved-accesses.ll | 144 +++++++++++---
6 files changed, 175 insertions(+), 497 deletions(-)
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 960c7956e0011..b684885b87a43 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -579,12 +579,21 @@ static unsigned getIntrinsicFactor(const IntrinsicInst *II) {
case Intrinsic::vector_deinterleave3:
case Intrinsic::vector_interleave3:
return 3;
+ case Intrinsic::vector_deinterleave4:
+ case Intrinsic::vector_interleave4:
+ return 4;
case Intrinsic::vector_deinterleave5:
case Intrinsic::vector_interleave5:
return 5;
+ case Intrinsic::vector_deinterleave6:
+ case Intrinsic::vector_interleave6:
+ return 6;
case Intrinsic::vector_deinterleave7:
case Intrinsic::vector_interleave7:
return 7;
+ case Intrinsic::vector_deinterleave8:
+ case Intrinsic::vector_interleave8:
+ return 8;
default:
llvm_unreachable("Unexpected intrinsic");
}
@@ -605,10 +614,9 @@ static unsigned getIntrinsicFactor(const IntrinsicInst *II) {
// to reorder them by interleaving these values.
static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
unsigned NumLeaves = SubLeaves.size();
- if (NumLeaves == 2 || !isPowerOf2_64(NumLeaves))
- return;
-
assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
+ if (NumLeaves == 2)
+ return;
const unsigned HalfLeaves = NumLeaves / 2;
// Visit the sub-trees.
@@ -629,8 +637,11 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
SmallVectorImpl<Instruction *> &DeadInsts) {
assert(II->getIntrinsicID() == Intrinsic::vector_interleave2 ||
II->getIntrinsicID() == Intrinsic::vector_interleave3 ||
+ II->getIntrinsicID() == Intrinsic::vector_interleave4 ||
II->getIntrinsicID() == Intrinsic::vector_interleave5 ||
- II->getIntrinsicID() == Intrinsic::vector_interleave7);
+ II->getIntrinsicID() == Intrinsic::vector_interleave6 ||
+ II->getIntrinsicID() == Intrinsic::vector_interleave7 ||
+ II->getIntrinsicID() == Intrinsic::vector_interleave8);
// Visit with BFS
SmallVector<IntrinsicInst *, 8> Queue;
@@ -660,13 +671,17 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
}
const unsigned Factor = Operands.size();
- // Currently we only recognize factors of 3, 5, 7, and powers of 2.
+ // Currently we only recognize factors 2...8 and other powers of 2.
// FIXME: should we assert here instead?
if (Factor <= 1 ||
(!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
return false;
- interleaveLeafValues(Operands);
+ // Recursively interleaved factors need to have their values reordered.
+ // TODO: Remove once the loop vectorizer no longer recursively interleaves
+ // factors 4 and 8.
+ if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2)
+ interleaveLeafValues(Operands);
return true;
}
@@ -676,8 +691,11 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
SmallVectorImpl<Instruction *> &DeadInsts) {
assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2 ||
II->getIntrinsicID() == Intrinsic::vector_deinterleave3 ||
+ II->getIntrinsicID() == Intrinsic::vector_deinterleave4 ||
II->getIntrinsicID() == Intrinsic::vector_deinterleave5 ||
- II->getIntrinsicID() == Intrinsic::vector_deinterleave7);
+ II->getIntrinsicID() == Intrinsic::vector_deinterleave6 ||
+ II->getIntrinsicID() == Intrinsic::vector_deinterleave7 ||
+ II->getIntrinsicID() == Intrinsic::vector_deinterleave8);
using namespace PatternMatch;
if (!II->hasNUses(getIntrinsicFactor(II)))
return false;
@@ -737,13 +755,17 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
}
const unsigned Factor = Results.size();
- // Currently we only recognize factors of 3, 5, 7, and powers of 2.
+ // Currently we only recognize factors 2...8 and other powers of 2.
// FIXME: should we assert here instead?
if (Factor <= 1 ||
(!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
return 0;
- interleaveLeafValues(Results);
+ // Recursively interleaved factors need to have their values reordered.
+ // TODO: Remove once the loop vectorizer no longer recursively interleaves
+ // factors 4 and 8.
+ if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2)
+ interleaveLeafValues(Results);
return true;
}
@@ -907,14 +929,20 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
switch (II->getIntrinsicID()) {
case Intrinsic::vector_deinterleave2:
case Intrinsic::vector_deinterleave3:
+ case Intrinsic::vector_deinterleave4:
case Intrinsic::vector_deinterleave5:
+ case Intrinsic::vector_deinterleave6:
case Intrinsic::vector_deinterleave7:
+ case Intrinsic::vector_deinterleave8:
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
break;
case Intrinsic::vector_interleave2:
case Intrinsic::vector_interleave3:
+ case Intrinsic::vector_interleave4:
case Intrinsic::vector_interleave5:
+ case Intrinsic::vector_interleave6:
case Intrinsic::vector_interleave7:
+ case Intrinsic::vector_interleave8:
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
break;
default:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 6b6c64b54956b..c2ae1ce491389 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -277,37 +277,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 1
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 24
-; CHECK-NEXT: vslidedown.vi v12, v8, 16
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 8
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v12, v10, a0
-; CHECK-NEXT: vslideup.vx v8, v9, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv.v.v v9, v12
-; CHECK-NEXT: vs2r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <32 x i8>, ptr %p
%d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave4(<32 x i8> %vec)
@@ -371,46 +342,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor6(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor6:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: li a1, 48
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a0, 40
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: srli a2, a2, 1
-; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v8, a0
-; CHECK-NEXT: add a0, a2, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v14, v8, 24
-; CHECK-NEXT: vslidedown.vi v20, v8, 16
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v13, v8, 8
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v20, v14, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v13, a2
-; CHECK-NEXT: vslideup.vx v16, v12, a2
-; CHECK-NEXT: vmv1r.v v9, v20
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv2r.v v10, v16
-; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlseg6e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <48 x i8>, ptr %p
%d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave6(<48 x i8> %vec)
@@ -457,55 +390,9 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor8(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: li a1, 64
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a0, 56
-; CHECK-NEXT: li a1, 48
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: li a0, 40
-; CHECK-NEXT: vslidedown.vx v12, v8, a1
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: srli a2, a2, 1
-; CHECK-NEXT: vslidedown.vx v20, v8, a0
-; CHECK-NEXT: add a0, a2, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v14, v8, 24
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v12, v16, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v22, v8, 16
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v22, v14, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a1
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 8
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a2
-; CHECK-NEXT: vslideup.vx v16, v20, a2
-; CHECK-NEXT: vmv1r.v v9, v22
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv1r.v v17, v12
-; CHECK-NEXT: vmv2r.v v10, v16
-; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: vmv1r.v v15, v14
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <64 x i8>, ptr %p
%d0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.vector.deinterleave8(<64 x i8> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index 6497675bd56f1..c394e7aa2e3e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -195,35 +195,8 @@ define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i3
define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor4:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: add a3, a1, a2
-; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg4e32.v v8, (a1)
-; CHECK-NEXT: add a2, a4, a2
-; CHECK-NEXT: vl1re32.v v8, (a4)
-; CHECK-NEXT: vl1re32.v v10, (a2)
-; CHECK-NEXT: vl1re32.v v12, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
-; CHECK-NEXT: vl1re32.v v10, (a3)
-; CHECK-NEXT: vslideup.vi v12, v10, 4
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v12, v8, 8
-; CHECK-NEXT: vse32.v v12, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <16 x i32> @llvm.vector.interleave4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d)
store <16 x i32> %v, ptr %p
@@ -258,51 +231,8 @@ define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i3
define void @vector_interleave_store_factor6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor6:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 6 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 32
-; CHECK-NEXT: add a4, a1, a2
-; CHECK-NEXT: add a5, a4, a2
-; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg6e32.v v8, (a1)
-; CHECK-NEXT: vl1re32.v v12, (a5)
-; CHECK-NEXT: add a5, a5, a2
-; CHECK-NEXT: vl1re32.v v10, (a5)
-; CHECK-NEXT: add a5, a5, a2
-; CHECK-NEXT: vl1re32.v v14, (a4)
-; CHECK-NEXT: vl1re32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v12, v10, 4
-; CHECK-NEXT: vslideup.vi v8, v14, 4
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 8
-; CHECK-NEXT: vl1re32.v v16, (a5)
-; CHECK-NEXT: add a2, a5, a2
-; CHECK-NEXT: vl1re32.v v12, (a2)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v16, v12, 4
-; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: vsetivli zero, 24, e32, m8, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg6e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <24 x i32> @llvm.vector.interleave6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f)
store <24 x i32> %v, ptr %p
@@ -323,51 +253,8 @@ define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i3
define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: add a2, a1, a3
-; CHECK-NEXT: add a4, a2, a3
-; CHECK-NEXT: add a5, a4, a3
-; CHECK-NEXT: add a6, a5, a3
-; CHECK-NEXT: add a7, a6, a3
-; CHECK-NEXT: add t0, a7, a3
-; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg8e32.v v8, (a1)
-; CHECK-NEXT: add a3, t0, a3
-; CHECK-NEXT: vl1re32.v v10, (a3)
-; CHECK-NEXT: vl1re32.v v12, (t0)
-; CHECK-NEXT: vl1re32.v v14, (a7)
-; CHECK-NEXT: vl1re32.v v8, (a6)
-; CHECK-NEXT: vl1re32.v v18, (a5)
-; CHECK-NEXT: vl1re32.v v20, (a4)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v12, v10, 4
-; CHECK-NEXT: vslideup.vi v8, v14, 4
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 8
-; CHECK-NEXT: vl1re32.v v16, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v20, v18, 4
-; CHECK-NEXT: vl1re32.v v12, (a2)
-; CHECK-NEXT: vslideup.vi v16, v12, 4
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v16, v20, 8
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vslideup.vi v16, v8, 16
-; CHECK-NEXT: vse32.v v16, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <32 x i32> @llvm.vector.interleave8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h)
store <32 x i32> %v, ptr %p
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index b15ccdedda8f1..9344c52098684 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -364,23 +364,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deint
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: vl4r.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -444,28 +429,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor6(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor6:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub a1, a1, a2
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg6e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 48 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave6(<vscale x 48 x i8> %vec)
@@ -512,23 +477,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor8(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vl8r.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg8e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 64 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave8(<vscale x 64 x i8> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 26d387baa5e22..3751967f18aa4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -253,30 +253,8 @@ define void @vector_interleave_store_factor3(<vscale x 2 x i32> %a, <vscale x 2
define void @vector_interleave_store_factor4(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor4:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: add a3, a1, a2
-; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg4e32.v v8, (a1)
-; CHECK-NEXT: vl1re32.v v10, (a4)
-; CHECK-NEXT: add a2, a4, a2
-; CHECK-NEXT: vl1re32.v v11, (a2)
-; CHECK-NEXT: vl1re32.v v8, (a1)
-; CHECK-NEXT: vl1re32.v v9, (a3)
-; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.vector.interleave4(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d)
store <vscale x 8 x i32> %v, ptr %p
@@ -309,129 +287,11 @@ define void @vector_interleave_store_factor5(<vscale x 2 x i32> %a, <vscale x 2
}
define void @vector_interleave_store_factor6(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, ptr %p) {
-; RV32-LABEL: vector_interleave_store_factor6:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: mv a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a2, a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: add a3, a1, a2
-; RV32-NEXT: add a4, a3, a2
-; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV32-NEXT: vsseg6e32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v10, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v11, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v8, (a1)
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: vl1re32.v v12, (a4)
-; RV32-NEXT: add a4, a4, a2
-; RV32-NEXT: vl1re32.v v13, (a4)
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: mv s0, a0
-; RV32-NEXT: srli a0, a2, 3
-; RV32-NEXT: li a1, 12
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v8, (s0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 32
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_interleave_store_factor6:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: .cfi_def_cfa_offset 32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 14 * vlenb
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: add a3, a1, a2
-; RV64-NEXT: add a4, a3, a2
-; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma
-; RV64-NEXT: vsseg6e32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v10, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v11, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v8, (a1)
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: vl1re32.v v12, (a4)
-; RV64-NEXT: add a4, a4, a2
-; RV64-NEXT: vl1re32.v v13, (a4)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: srli a0, a2, 3
-; RV64-NEXT: li a1, 12
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v8, (s0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 32
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: vector_interleave_store_factor6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg6e32.v v8, (a0)
+; CHECK-NEXT: ret
%v = call <vscale x 12 x i32> @llvm.vector.interleave6(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f)
store <vscale x 12 x i32> %v, ptr %p
ret void
@@ -451,38 +311,8 @@ define void @vector_interleave_store_factor7(<vscale x 2 x i32> %a, <vscale x 2
define void @vector_interleave_store_factor8(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, <vscale x 2 x i32> %h, ptr %p) {
; CHECK-LABEL: vector_interleave_store_factor8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: add a3, a1, a2
-; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: add a5, a4, a2
-; CHECK-NEXT: add a6, a5, a2
-; CHECK-NEXT: add a7, a6, a2
-; CHECK-NEXT: add t0, a7, a2
-; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg8e32.v v8, (a1)
-; CHECK-NEXT: vl1re32.v v14, (t0)
-; CHECK-NEXT: add a2, t0, a2
-; CHECK-NEXT: vl1re32.v v15, (a2)
-; CHECK-NEXT: vl1re32.v v12, (a6)
-; CHECK-NEXT: vl1re32.v v13, (a7)
-; CHECK-NEXT: vl1re32.v v10, (a4)
-; CHECK-NEXT: vl1re32.v v11, (a5)
-; CHECK-NEXT: vl1re32.v v8, (a1)
-; CHECK-NEXT: vl1re32.v v9, (a3)
-; CHECK-NEXT: vs8r.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a0)
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.vector.interleave8(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d, <vscale x 2 x i32> %e, <vscale x 2 x i32> %f, <vscale x 2 x i32> %g, <vscale x 2 x i32> %h)
store <vscale x 16 x i32> %v, ptr %p
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
index ebdeb70538d4a..87b16d17aa5f0 100644
--- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
@@ -135,8 +135,15 @@ define void @load_factor4(ptr %ptr) {
define void @load_factor4_vscale(ptr %ptr) {
; RV32-LABEL: @load_factor4_vscale(
-; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
-; RV32-NEXT: [[TMP9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5)
+; RV32-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 0)
+; RV32-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP2]], 0
+; RV32-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 1)
+; RV32-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], <vscale x 4 x i32> [[TMP4]], 1
+; RV32-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 2)
+; RV32-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]], <vscale x 4 x i32> [[TMP6]], 2
+; RV32-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 3)
+; RV32-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], <vscale x 4 x i32> [[TMP8]], 3
; RV32-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 0
; RV32-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 1
; RV32-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 2
@@ -144,8 +151,15 @@ define void @load_factor4_vscale(ptr %ptr) {
; RV32-NEXT: ret void
;
; RV64-LABEL: @load_factor4_vscale(
-; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
-; RV64-NEXT: [[TMP9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5)
+; RV64-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 0)
+; RV64-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[TMP2]], 0
+; RV64-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 1)
+; RV64-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], <vscale x 4 x i32> [[TMP4]], 1
+; RV64-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 2)
+; RV64-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]], <vscale x 4 x i32> [[TMP6]], 2
+; RV64-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) [[TMP1]], i32 3)
+; RV64-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], <vscale x 4 x i32> [[TMP8]], 3
; RV64-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 0
; RV64-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 1
; RV64-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP9]], 2
@@ -318,8 +332,19 @@ define void @load_factor6(ptr %ptr) {
define void @load_factor6_vscale(ptr %ptr) {
; RV32-LABEL: @load_factor6_vscale(
-; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[PTR:%.*]], align 64
-; RV32-NEXT: [[TMP13:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave6.nxv12i32(<vscale x 12 x i32> [[INTERLEAVED_VEC]])
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) poison, ptr [[PTR:%.*]], i32 -1, i32 5)
+; RV32-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 0)
+; RV32-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP2]], 0
+; RV32-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 1)
+; RV32-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP3]], <vscale x 2 x i32> [[TMP4]], 1
+; RV32-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 2)
+; RV32-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP5]], <vscale x 2 x i32> [[TMP6]], 2
+; RV32-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 3)
+; RV32-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP7]], <vscale x 2 x i32> [[TMP8]], 3
+; RV32-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 4)
+; RV32-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP9]], <vscale x 2 x i32> [[TMP10]], 4
+; RV32-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 5)
+; RV32-NEXT: [[TMP13:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP11]], <vscale x 2 x i32> [[TMP12]], 5
; RV32-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 0
; RV32-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 1
; RV32-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 2
@@ -329,8 +354,19 @@ define void @load_factor6_vscale(ptr %ptr) {
; RV32-NEXT: ret void
;
; RV64-LABEL: @load_factor6_vscale(
-; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[PTR:%.*]], align 64
-; RV64-NEXT: [[TMP13:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave6.nxv12i32(<vscale x 12 x i32> [[INTERLEAVED_VEC]])
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) poison, ptr [[PTR:%.*]], i64 -1, i64 5)
+; RV64-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 0)
+; RV64-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP2]], 0
+; RV64-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 1)
+; RV64-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP3]], <vscale x 2 x i32> [[TMP4]], 1
+; RV64-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 2)
+; RV64-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP5]], <vscale x 2 x i32> [[TMP6]], 2
+; RV64-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 3)
+; RV64-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP7]], <vscale x 2 x i32> [[TMP8]], 3
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 4)
+; RV64-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP9]], <vscale x 2 x i32> [[TMP10]], 4
+; RV64-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], i32 5)
+; RV64-NEXT: [[TMP13:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP11]], <vscale x 2 x i32> [[TMP12]], 5
; RV64-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 0
; RV64-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 1
; RV64-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], 2
@@ -486,8 +522,23 @@ define void @load_factor8(ptr %ptr) {
define void @load_factor8_vscale(ptr %ptr) {
; RV32-LABEL: @load_factor8_vscale(
-; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
-; RV32-NEXT: [[TMP17:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave8.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5)
+; RV32-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 0)
+; RV32-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP2]], 0
+; RV32-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 1)
+; RV32-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP3]], <vscale x 2 x i32> [[TMP4]], 1
+; RV32-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 2)
+; RV32-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP5]], <vscale x 2 x i32> [[TMP6]], 2
+; RV32-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 3)
+; RV32-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP7]], <vscale x 2 x i32> [[TMP8]], 3
+; RV32-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 4)
+; RV32-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP9]], <vscale x 2 x i32> [[TMP10]], 4
+; RV32-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 5)
+; RV32-NEXT: [[TMP13:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP11]], <vscale x 2 x i32> [[TMP12]], 5
+; RV32-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 6)
+; RV32-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], <vscale x 2 x i32> [[TMP14]], 6
+; RV32-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 7)
+; RV32-NEXT: [[TMP17:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP15]], <vscale x 2 x i32> [[TMP16]], 7
; RV32-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 0
; RV32-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 1
; RV32-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 2
@@ -499,8 +550,23 @@ define void @load_factor8_vscale(ptr %ptr) {
; RV32-NEXT: ret void
;
; RV64-LABEL: @load_factor8_vscale(
-; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[PTR:%.*]], align 64
-; RV64-NEXT: [[TMP17:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave8.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5)
+; RV64-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 0)
+; RV64-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP2]], 0
+; RV64-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 1)
+; RV64-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP3]], <vscale x 2 x i32> [[TMP4]], 1
+; RV64-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 2)
+; RV64-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP5]], <vscale x 2 x i32> [[TMP6]], 2
+; RV64-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 3)
+; RV64-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP7]], <vscale x 2 x i32> [[TMP8]], 3
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 4)
+; RV64-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP9]], <vscale x 2 x i32> [[TMP10]], 4
+; RV64-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 5)
+; RV64-NEXT: [[TMP13:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP11]], <vscale x 2 x i32> [[TMP12]], 5
+; RV64-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 6)
+; RV64-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP13]], <vscale x 2 x i32> [[TMP14]], 6
+; RV64-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], i32 7)
+; RV64-NEXT: [[TMP17:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP15]], <vscale x 2 x i32> [[TMP16]], 7
; RV64-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 0
; RV64-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 1
; RV64-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP17]], 2
@@ -722,13 +788,19 @@ define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2
define void @store_factor4_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3) {
; RV32-LABEL: @store_factor4_vscale(
-; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv32i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]])
-; RV32-NEXT: store <vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
+; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP1]], <vscale x 8 x i8> [[V1:%.*]], i32 1)
+; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP2]], <vscale x 8 x i8> [[V2:%.*]], i32 2)
+; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP3]], <vscale x 8 x i8> [[V3:%.*]], i32 3)
+; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3)
; RV32-NEXT: ret void
;
; RV64-LABEL: @store_factor4_vscale(
-; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv32i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]])
-; RV64-NEXT: store <vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 32
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
+; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP1]], <vscale x 8 x i8> [[V1:%.*]], i32 1)
+; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP2]], <vscale x 8 x i8> [[V2:%.*]], i32 2)
+; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP3]], <vscale x 8 x i8> [[V3:%.*]], i32 3)
+; RV64-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) [[TMP4]], ptr [[PTR:%.*]], i64 -1, i64 3)
; RV64-NEXT: ret void
;
%interleaved.vec = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3)
@@ -859,13 +931,23 @@ define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32
define void @store_factor6_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5) {
; RV32-LABEL: @store_factor6_vscale(
-; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv48i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]])
-; RV32-NEXT: store <vscale x 48 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
+; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], <vscale x 8 x i8> [[V1:%.*]], i32 1)
+; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP2]], <vscale x 8 x i8> [[V2:%.*]], i32 2)
+; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP3]], <vscale x 8 x i8> [[V3:%.*]], i32 3)
+; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP4]], <vscale x 8 x i8> [[V4:%.*]], i32 4)
+; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP5]], <vscale x 8 x i8> [[V5:%.*]], i32 5)
+; RV32-NEXT: call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv8i8_6t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP6]], ptr [[PTR:%.*]], i32 -1, i32 3)
; RV32-NEXT: ret void
;
; RV64-LABEL: @store_factor6_vscale(
-; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv48i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]])
-; RV64-NEXT: store <vscale x 48 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
+; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP1]], <vscale x 8 x i8> [[V1:%.*]], i32 1)
+; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP2]], <vscale x 8 x i8> [[V2:%.*]], i32 2)
+; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP3]], <vscale x 8 x i8> [[V3:%.*]], i32 3)
+; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP4]], <vscale x 8 x i8> [[V4:%.*]], i32 4)
+; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP5]], <vscale x 8 x i8> [[V5:%.*]], i32 5)
+; RV64-NEXT: call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv8i8_6t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) [[TMP6]], ptr [[PTR:%.*]], i64 -1, i64 3)
; RV64-NEXT: ret void
;
%interleaved.vec = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5)
@@ -903,13 +985,27 @@ define void @store_factor7_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x
define void @store_factor8_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5, <vscale x 8 x i8> %v6, <vscale x 8 x i8> %v7) {
; RV32-LABEL: @store_factor8_vscale(
-; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave8.nxv64i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]], <vscale x 8 x i8> [[V6:%.*]], <vscale x 8 x i8> [[V7:%.*]])
-; RV32-NEXT: store <vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
+; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], <vscale x 8 x i8> [[V1:%.*]], i32 1)
+; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP2]], <vscale x 8 x i8> [[V2:%.*]], i32 2)
+; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP3]], <vscale x 8 x i8> [[V3:%.*]], i32 3)
+; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP4]], <vscale x 8 x i8> [[V4:%.*]], i32 4)
+; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP5]], <vscale x 8 x i8> [[V5:%.*]], i32 5)
+; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP6]], <vscale x 8 x i8> [[V6:%.*]], i32 6)
+; RV32-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP7]], <vscale x 8 x i8> [[V7:%.*]], i32 7)
+; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3)
; RV32-NEXT: ret void
;
; RV64-LABEL: @store_factor8_vscale(
-; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave8.nxv64i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], <vscale x 8 x i8> [[V2:%.*]], <vscale x 8 x i8> [[V3:%.*]], <vscale x 8 x i8> [[V4:%.*]], <vscale x 8 x i8> [[V5:%.*]], <vscale x 8 x i8> [[V6:%.*]], <vscale x 8 x i8> [[V7:%.*]])
-; RV64-NEXT: store <vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[PTR:%.*]], align 64
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 8 x i8> [[V0:%.*]], i32 0)
+; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP1]], <vscale x 8 x i8> [[V1:%.*]], i32 1)
+; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP2]], <vscale x 8 x i8> [[V2:%.*]], i32 2)
+; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP3]], <vscale x 8 x i8> [[V3:%.*]], i32 3)
+; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP4]], <vscale x 8 x i8> [[V4:%.*]], i32 4)
+; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP5]], <vscale x 8 x i8> [[V5:%.*]], i32 5)
+; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP6]], <vscale x 8 x i8> [[V6:%.*]], i32 6)
+; RV64-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP7]], <vscale x 8 x i8> [[V7:%.*]], i32 7)
+; RV64-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) [[TMP8]], ptr [[PTR:%.*]], i64 -1, i64 3)
; RV64-NEXT: ret void
;
%interleaved.vec = call <vscale x 64 x i8> @llvm.vector.interleave8.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1, <vscale x 8 x i8> %v2, <vscale x 8 x i8> %v3, <vscale x 8 x i8> %v4, <vscale x 8 x i8> %v5, <vscale x 8 x i8> %v6, <vscale x 8 x i8> %v7)