[llvm] [RISCV] Support postRA vsetvl insertion pass (PR #70549)
Piyou Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 1 02:58:11 PDT 2024
https://github.com/BeMg updated https://github.com/llvm/llvm-project/pull/70549
From 540a461c8c75768c746b36b7f135f948ee8f13a2 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Wed, 24 Jan 2024 22:46:58 -0800
Subject: [PATCH 1/5] Precommit for testcase
---
llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll | 1733 +++++++++++++++++
1 file changed, 1733 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
new file mode 100644
index 00000000000000..8204cec7e27794
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
@@ -0,0 +1,1733 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b -target-abi=lp64d \
+; RUN: --riscv-split-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 2 x i1> @fcmp_ole_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb) nounwind strictfp {
+; CHECK-LABEL: fcmp_ole_vv_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vmfeq.vv v10, v9, v9
+; CHECK-NEXT: vmfeq.vv v11, v8, v8
+; CHECK-NEXT: vmand.mm v0, v11, v10
+; CHECK-NEXT: vmfle.vv v0, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
+ ret <vscale x 2 x i1> %1
+}
+
+define dso_local void @test_interleave_cause_spill(ptr nocapture noundef %in) local_unnamed_addr #0 {
+; CHECK-LABEL: test_interleave_cause_spill:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: addi a1, a0, 4
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 8
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: addi a1, a0, 12
+; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a0, 16
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vle32.v v0, (a1)
+; CHECK-NEXT: addi a1, a0, 20
+; CHECK-NEXT: vle32.v v4, (a1)
+; CHECK-NEXT: addi a1, a0, 24
+; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vadd.vv v24, v8, v24
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vadd.vv v20, v0, v4
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vadd.vv v8, v8, v24
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vadd.vv v16, v0, v16
+; CHECK-NEXT: addi a1, a0, 40
+; CHECK-NEXT: vse32.v v20, (a1)
+; CHECK-NEXT: addi a1, a0, 44
+; CHECK-NEXT: vse32.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 48
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v16, (a1)
+; CHECK-NEXT: addi a0, a0, 52
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+entry:
+ %add.ptr = getelementptr inbounds i32, ptr %in, i64 1
+ %0 = tail call <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32> poison, ptr nonnull %add.ptr, i64 32)
+ %add.ptr1 = getelementptr inbounds i32, ptr %in, i64 2
+ %1 = tail call <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32> poison, ptr nonnull %add.ptr1, i64 32)
+ %add.ptr2 = getelementptr inbounds i32, ptr %in, i64 3
+ %2 = tail call <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32> poison, ptr nonnull %add.ptr2, i64 32)
+ %add.ptr3 = getelementptr inbounds i32, ptr %in, i64 4
+ %3 = tail call <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32> poison, ptr nonnull %add.ptr3, i64 32)
+ %add.ptr4 = getelementptr inbounds i32, ptr %in, i64 5
+ %4 = tail call <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32> poison, ptr nonnull %add.ptr4, i64 32)
+ %add.ptr5 = getelementptr inbounds i32, ptr %in, i64 6
+ %5 = tail call <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32> poison, ptr nonnull %add.ptr5, i64 32)
+ %6 = tail call <vscale x 16 x i32> @llvm.riscv.vadd.nxv16i32.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 32)
+ %7 = tail call <vscale x 8 x i32> @llvm.riscv.vadd.nxv8i32.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> %3, <vscale x 8 x i32> %4, i64 32)
+ %8 = tail call <vscale x 16 x i32> @llvm.riscv.vadd.nxv16i32.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %0, <vscale x 16 x i32> %2, i64 32)
+ %9 = tail call <vscale x 8 x i32> @llvm.riscv.vadd.nxv8i32.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> %3, <vscale x 8 x i32> %5, i64 32)
+ %add.ptr6 = getelementptr inbounds i32, ptr %in, i64 10
+ tail call void @llvm.riscv.vse.nxv8i32.i64(<vscale x 8 x i32> %7, ptr nonnull %add.ptr6, i64 32)
+ %add.ptr7 = getelementptr inbounds i32, ptr %in, i64 11
+ tail call void @llvm.riscv.vse.nxv8i32.i64(<vscale x 8 x i32> %9, ptr nonnull %add.ptr7, i64 32)
+ %add.ptr8 = getelementptr inbounds i32, ptr %in, i64 12
+ tail call void @llvm.riscv.vse.nxv16i32.i64(<vscale x 16 x i32> %6, ptr nonnull %add.ptr8, i64 32)
+ %add.ptr9 = getelementptr inbounds i32, ptr %in, i64 13
+ tail call void @llvm.riscv.vse.nxv16i32.i64(<vscale x 16 x i32> %8, ptr nonnull %add.ptr9, i64 32)
+ ret void
+}
+
+define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlz_v15i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsrl.vx v16, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v8, a0, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: lui a0, 4112
+; CHECK-NEXT: addiw a0, a0, 257
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 56
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
+ ret <15 x i64> %v
+}
+
+define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bitreverse_nxv7i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: lui a1, 4080
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v16, v16, 24, v0.t
+; CHECK-NEXT: li a0, 255
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v24, v24, 8, v0.t
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 56
+; CHECK-NEXT: vsll.vx v24, v8, a2, v0.t
+; CHECK-NEXT: lui a3, 16
+; CHECK-NEXT: addiw a3, a3, -256
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsll.vx v16, v16, a4, v0.t
+; CHECK-NEXT: vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsrl.vx v24, v8, a2, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a4, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vor.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 7 x i64> @llvm.vp.bitreverse.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 %evl)
+ ret <vscale x 7 x i64> %v
+}
+
+define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bitreverse_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: lui a1, 4080
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v16, v16, 24, v0.t
+; CHECK-NEXT: li a0, 255
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v24, v24, 8, v0.t
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 56
+; CHECK-NEXT: vsll.vx v24, v8, a2, v0.t
+; CHECK-NEXT: lui a3, 16
+; CHECK-NEXT: addiw a3, a3, -256
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsll.vx v16, v16, a4, v0.t
+; CHECK-NEXT: vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsrl.vx v24, v8, a2, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a4, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vor.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i64> @llvm.vp.bitreverse.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lanes.b, <4 x i1> %sel) {
+; CHECK-LABEL: constant_folding_crash:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld a0, 8(a0)
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v13, a0
+; CHECK-NEXT: vmsne.vi v0, v13, 0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 10
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+entry:
+ %sunkaddr = getelementptr i8, ptr %v54, i64 8
+ %v56 = load i64, ptr %sunkaddr, align 8
+ %trunc = and i64 %v56, 1
+ %cmp = icmp eq i64 %trunc, 0
+ %ptrs = select i1 %cmp, <4 x ptr> %lanes.a, <4 x ptr> %lanes.b
+ %v67 = extractelement <4 x ptr> %ptrs, i64 0
+ %mask = shufflevector <4 x i1> %sel, <4 x i1> undef, <4 x i32> zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> <i32 10, i32 10, i32 10, i32 10>, ptr %v67, i32 16, <4 x i1> %mask)
+ ret void
+}
+
+define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctpop_nxv16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 3
+; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v24, v0, a2
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: bltu a0, a1, .LBB6_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a2, 349525
+; CHECK-NEXT: addiw a2, a2, 1365
+; CHECK-NEXT: slli a3, a2, 32
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a3, 209715
+; CHECK-NEXT: addiw a3, a3, 819
+; CHECK-NEXT: slli a4, a3, 32
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a4, 61681
+; CHECK-NEXT: addiw a4, a4, -241
+; CHECK-NEXT: slli a5, a4, 32
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT: lui a5, 4112
+; CHECK-NEXT: addiw a5, a5, 257
+; CHECK-NEXT: slli a6, a5, 32
+; CHECK-NEXT: add a5, a5, a6
+; CHECK-NEXT: vmul.vx v8, v8, a5, v0.t
+; CHECK-NEXT: li a6, 56
+; CHECK-NEXT: vsrl.vx v8, v8, a6, v0.t
+; CHECK-NEXT: addi a7, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT: vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vand.vx v8, v16, a3, v0.t
+; CHECK-NEXT: vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT: vmul.vx v8, v8, a5, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a6, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i64> @llvm.vp.ctpop.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i64> %v
+}
+
+define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bitreverse_v15i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: lui a1, 4080
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v16, v16, 24, v0.t
+; CHECK-NEXT: li a0, 255
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v24, v24, 8, v0.t
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 56
+; CHECK-NEXT: vsll.vx v24, v8, a2, v0.t
+; CHECK-NEXT: lui a3, 16
+; CHECK-NEXT: addiw a3, a3, -256
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsll.vx v16, v16, a4, v0.t
+; CHECK-NEXT: vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsrl.vx v24, v8, a2, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a4, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vor.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
+ ret <15 x i64> %v
+}
+
+define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: add_constant_rhs_8xi32_partial:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
+; CHECK-NEXT: vmv.s.x v10, a0
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 5
+; CHECK-NEXT: vmv.s.x v10, a2
+; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 6
+; CHECK-NEXT: vmv.s.x v10, a3
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; CHECK-NEXT: vle32.v v12, (a0)
+; CHECK-NEXT: vslideup.vi v8, v10, 7
+; CHECK-NEXT: vadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %vadd = add <8 x i32> %vin, <i32 1, i32 2, i32 3, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
+ %e0 = add i32 %a, 23
+ %e1 = add i32 %b, 25
+ %e2 = add i32 %c, 1
+ %e3 = add i32 %d, 2355
+ %v0 = insertelement <8 x i32> %vadd, i32 %e0, i32 4
+ %v1 = insertelement <8 x i32> %v0, i32 %e1, i32 5
+ %v2 = insertelement <8 x i32> %v1, i32 %e2, i32 6
+ %v3 = insertelement <8 x i32> %v2, i32 %e3, i32 7
+ ret <8 x i32> %v3
+}
+
+define <8 x i1> @fp2si_v8f64_v8i1(<8 x double> %x) {
+; CHECK-LABEL: fp2si_v8f64_v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
+; CHECK-NEXT: vand.vi v8, v12, 1
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %z = fptosi <8 x double> %x to <8 x i1>
+ ret <8 x i1> %z
+}
+
+define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
+; CHECK-LABEL: insert_v8i32_v2i32_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vse32.v v10, (a0)
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, ptr %svp
+ %vec = load <8 x i32>, ptr %vp
+ %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
+ store <8 x i32> %v, ptr %vp
+ ret void
+}
+
+define void @buildvec_seq_v9i8(ptr %x) {
+; CHECK-LABEL: buildvec_seq_v9i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 73
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 3
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: li a1, 146
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 2, v0
+; CHECK-NEXT: vsetivli zero, 9, e8, m1, ta, ma
+; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: ret
+ store <9 x i8> <i8 1, i8 2, i8 3, i8 1, i8 2, i8 3, i8 1, i8 2, i8 3>, ptr %x
+ ret void
+}
+
+define <4 x i1> @load_large_vector(ptr %p) {
+; CHECK-LABEL: load_large_vector:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: ret
+ %l = load <12 x ptr>, ptr %p
+ %s1 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %s2 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+ %ret = icmp ne <4 x ptr> %s1, %s2
+ ret <4 x i1> %ret
+}
+
+define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) {
+; CHECK-LABEL: load_factor6_too_big:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 52
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x34, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 52 * vlenb
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: addi a2, a1, 256
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 27
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a2, a1, 128
+; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 35
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vrgather.vi v8, v16, 4
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vmv.s.x v4, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v24, v16, 8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs1r.v v4, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vi v8, v24, 2, v0.t
+; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vmul.vx v2, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v2
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 56
+; CHECK-NEXT: vmv.s.x v1, a1
+; CHECK-NEXT: vadd.vi v16, v2, -16
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v8, v16, 5
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v8, v16, 3, v0.t
+; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs2r.v v2, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vadd.vi v24, v2, 1
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v24, v2, -15
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs2r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v2, v0.t
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vadd.vi v4, v2, 2
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v4
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: vmv.s.x v4, a1
+; CHECK-NEXT: vadd.vi v16, v2, -14
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 6
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v24, v16
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v16, v24, 4, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v16, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v28, v2, 3
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v28
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v16, v2, -13
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: lui a1, 16
+; CHECK-NEXT: addi a1, a1, 7
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v24, v16, v12
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v24, v16, 5, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v24, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: lui a1, 96
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: li a1, 192
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v4, v24, 2
+; CHECK-NEXT: vrgatherei16.vv v4, v16, v8, v0.t
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v26, v2, 4
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v26
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 28
+; CHECK-NEXT: vmv.s.x v1, a1
+; CHECK-NEXT: vadd.vi v16, v2, -12
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: lui a1, 112
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v8, v16, 3
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v12, v2, 5
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v12, v2, -11
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: addi a1, a0, 320
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vse64.v v4, (a1)
+; CHECK-NEXT: addi a1, a0, 192
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a3, a2, 1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a3, a2, 3
+; CHECK-NEXT: sub a2, a3, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 64
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 11
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 52
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %interleaved.vec = load <48 x i64>, ptr %ptr
+ %v0 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
+ %v1 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43>
+ %v2 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44>
+ %v3 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45>
+ %v4 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46>
+ %v5 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47>
+ %res0 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} undef, <8 x i64> %v0, 0
+ %res1 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res0, <8 x i64> %v1, 1
+ %res2 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res1, <8 x i64> %v2, 2
+ %res3 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res2, <8 x i64> %v3, 3
+ %res4 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res3, <8 x i64> %v4, 4
+ %res5 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res4, <8 x i64> %v5, 5
+ ret {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res5
+}
+
+
+define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vtrunc_v128i32_v128i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 56
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; CHECK-NEXT: vmv1r.v v4, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 5
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v1, v0, 8
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v2, v0, 4
+; CHECK-NEXT: addi a2, a1, 512
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v27, v1, 4
+; CHECK-NEXT: addi a2, a1, 640
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: addi a2, a7, -64
+; CHECK-NEXT: sltu a3, a7, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a4, a3, a2
+; CHECK-NEXT: addi a2, a4, -32
+; CHECK-NEXT: sltu a3, a4, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a2
+; CHECK-NEXT: addi a2, a3, -16
+; CHECK-NEXT: sltu a5, a3, a2
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a2, a5, a2
+; CHECK-NEXT: vslidedown.vi v0, v27, 2
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: addi a5, a1, 128
+; CHECK-NEXT: bltu a3, a2, .LBB14_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v12, v2, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a5)
+; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
+; CHECK-NEXT: li a3, 64
+; CHECK-NEXT: vmv1r.v v0, v27
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 40
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 48
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a7, a3, .LBB14_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: li a7, 64
+; CHECK-NEXT: .LBB14_4:
+; CHECK-NEXT: addi a5, a1, 384
+; CHECK-NEXT: li a3, 32
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: addi a6, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a6, a7, -32
+; CHECK-NEXT: sltu t0, a7, a6
+; CHECK-NEXT: addi t0, t0, -1
+; CHECK-NEXT: and a6, t0, a6
+; CHECK-NEXT: addi t0, a6, -16
+; CHECK-NEXT: sltu t1, a6, t0
+; CHECK-NEXT: addi t1, t1, -1
+; CHECK-NEXT: and t0, t1, t0
+; CHECK-NEXT: vsetvli zero, t0, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: add t0, sp, t0
+; CHECK-NEXT: addi t0, t0, 16
+; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a6, a2, .LBB14_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: li a6, 16
+; CHECK-NEXT: .LBB14_6:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v3, v1, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a5)
+; CHECK-NEXT: addi a1, a1, 256
+; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v2
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 40
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a4, a3, .LBB14_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: li a4, 32
+; CHECK-NEXT: .LBB14_8:
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a4, -16
+; CHECK-NEXT: sltu a5, a4, a1
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a1, a5, a1
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
+; CHECK-NEXT: bltu a4, a2, .LBB14_10
+; CHECK-NEXT: # %bb.9:
+; CHECK-NEXT: li a4, 16
+; CHECK-NEXT: .LBB14_10:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v5, v4, 2
+; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: bltu a7, a3, .LBB14_12
+; CHECK-NEXT: # %bb.11:
+; CHECK-NEXT: li a7, 32
+; CHECK-NEXT: .LBB14_12:
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 48
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vslideup.vi v8, v16, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 48
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vslideup.vi v8, v16, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv4r.v v8, v0
+; CHECK-NEXT: vslideup.vi v8, v24, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a7, -16
+; CHECK-NEXT: sltu a4, a7, a1
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v5
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT: bltu a7, a2, .LBB14_14
+; CHECK-NEXT: # %bb.13:
+; CHECK-NEXT: li a7, 16
+; CHECK-NEXT: .LBB14_14:
+; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v24, v16, 16
+; CHECK-NEXT: vse32.v v24, (a0)
+; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v8, (a1)
+; CHECK-NEXT: addi a0, a0, 384
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 48
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 56
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> %a, <128 x i1> %m, i32 %vl)
+ ret <128 x i32> %v
+}
+
+
+define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v2i32_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vmseq.vi v8, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.x.s a2, v8
+; CHECK-NEXT: andi a3, a2, 1
+; CHECK-NEXT: # implicit-def: $v8
+; CHECK-NEXT: beqz a3, .LBB15_2
+; CHECK-NEXT: # %bb.1: # %cond.load
+; CHECK-NEXT: lbu a3, 1(a0)
+; CHECK-NEXT: lbu a4, 0(a0)
+; CHECK-NEXT: lbu a5, 2(a0)
+; CHECK-NEXT: lb a6, 3(a0)
+; CHECK-NEXT: slli a3, a3, 8
+; CHECK-NEXT: or a3, a3, a4
+; CHECK-NEXT: slli a5, a5, 16
+; CHECK-NEXT: slli a6, a6, 24
+; CHECK-NEXT: or a4, a6, a5
+; CHECK-NEXT: or a3, a4, a3
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a3
+; CHECK-NEXT: .LBB15_2: # %else
+; CHECK-NEXT: andi a2, a2, 2
+; CHECK-NEXT: beqz a2, .LBB15_4
+; CHECK-NEXT: # %bb.3: # %cond.load1
+; CHECK-NEXT: lbu a2, 5(a0)
+; CHECK-NEXT: lbu a3, 4(a0)
+; CHECK-NEXT: lbu a4, 6(a0)
+; CHECK-NEXT: lb a0, 7(a0)
+; CHECK-NEXT: slli a2, a2, 8
+; CHECK-NEXT: or a2, a2, a3
+; CHECK-NEXT: slli a4, a4, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a4
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: .LBB15_4: # %else2
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vse32.v v8, (a1)
+; CHECK-NEXT: ret
+ %mask = icmp eq <2 x i32> %m, zeroinitializer
+ %load = call <2 x i32> @llvm.masked.load.v2i32(ptr %a, i32 1, <2 x i1> %mask, <2 x i32> undef)
+ store <2 x i32> %load, ptr %res_ptr
+ ret void
+}
+
+define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vand_vx_v11i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <11 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <11 x i64> %elt.head, <11 x i64> poison, <11 x i32> zeroinitializer
+ %v = call <11 x i64> @llvm.vp.and.v11i64(<11 x i64> %va, <11 x i64> %vb, <11 x i1> %m, i32 %evl)
+ ret <11 x i64> %v
+}
+
+define <11 x i64> @vand_vx_v11i64_unmasked(<11 x i64> %va, i64 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vand_vx_v11i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <11 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <11 x i64> %elt.head, <11 x i64> poison, <11 x i32> zeroinitializer
+ %head = insertelement <11 x i1> poison, i1 true, i32 0
+ %m = shufflevector <11 x i1> %head, <11 x i1> poison, <11 x i32> zeroinitializer
+ %v = call <11 x i64> @llvm.vp.and.v11i64(<11 x i64> %va, <11 x i64> %vb, <11 x i1> %m, i32 %evl)
+ ret <11 x i64> %v
+}
+
+define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_v32i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: bltu a2, a1, .LBB18_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB18_2:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: addi a0, a2, -16
+; CHECK-NEXT: sltu a1, a2, a0
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v0, v0, 2
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <32 x i64> @llvm.vp.select.v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 %evl)
+ ret <32 x i64> %v
+}
+
+define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, <vscale x 8 x ptr> %ptrs0, <vscale x 8 x ptr> %ptrs1, <vscale x 16 x i1> %m) {
+; CHECK-LABEL: mscatter_nxv16f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vl8re64.v v24, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v16, (a1)
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei64.v v8, (zero), v24, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a0
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsoxei64.v v8, (zero), v16, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %p0 = call <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr> undef, <vscale x 8 x ptr> %ptrs0, i64 0)
+ %p1 = call <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr> %p0, <vscale x 8 x ptr> %ptrs1, i64 8)
+ %v0 = call <vscale x 16 x double> @llvm.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> undef, <vscale x 8 x double> %val0, i64 0)
+ %v1 = call <vscale x 16 x double> @llvm.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> %v0, <vscale x 8 x double> %val1, i64 8)
+ call void @llvm.masked.scatter.nxv16f64.nxv16p0(<vscale x 16 x double> %v1, <vscale x 16 x ptr> %p1, i32 8, <vscale x 16 x i1> %m)
+ ret void
+}
+
+define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i64_nxv16i64(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv8i64_nxv16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vl8re64.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v0, (a0)
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vadd.vv v16, v8, v8
+; CHECK-NEXT: vrgather.vv v8, v0, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vv v24, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vadd.vi v8, v16, 1
+; CHECK-NEXT: vrgather.vv v16, v0, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vv v24, v0, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v28, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v20, v8
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %vec = load <vscale x 16 x i64>, ptr %p
+ %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+ ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
+}
+
+
+define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x half> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT: vfmacc.vv v8, v16, v24
+; CHECK-NEXT: ret
+ %vd = call <vscale x 32 x half> @llvm.fma.v32f16(<vscale x 32 x half> %vc, <vscale x 32 x half> %vb, <vscale x 32 x half> %va)
+ ret <vscale x 32 x half> %vd
+}
+
+
+define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v28, v16, v0.t
+; CHECK-NEXT: bltu a0, a1, .LBB22_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vfncvt.rtz.x.f.w v24, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpgather_baseidx_nxv16i16_nxv16f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsext.vf4 v16, v8
+; CHECK-NEXT: vsll.vi v24, v16, 3
+; CHECK-NEXT: vsext.vf4 v16, v10
+; CHECK-NEXT: vsll.vi v16, v16, 3
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: sub a3, a1, a2
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a4, a2, 3
+; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vluxei64.v v16, (a0), v16, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB23_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: .LBB23_2:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vluxei64.v v8, (a0), v24, v0.t
+; CHECK-NEXT: ret
+ %ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i16> %idxs
+ %v = call <vscale x 16 x double> @llvm.vp.gather.nxv16f64.nxv16p0(<vscale x 16 x ptr> %ptrs, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x double> %v
+}
+
+
+define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a1, a3, 3
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vl8re32.v v8, (a1)
+; CHECK-NEXT: slli a1, a3, 1
+; CHECK-NEXT: sub a4, a2, a1
+; CHECK-NEXT: sltu a5, a2, a4
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a4, a5, a4
+; CHECK-NEXT: srli a3, a3, 2
+; CHECK-NEXT: vl8re32.v v0, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a3
+; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: bltu a2, a1, .LBB24_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB24_2:
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, <vscale x 4 x i64>* %y) {
+; CHECK-LABEL: illegal_preserve_vl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v12, v12, v12
+; CHECK-NEXT: vs4r.v v12, (a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %index = add <vscale x 4 x i64> %x, %x
+ store <vscale x 4 x i64> %index, <vscale x 4 x i64>* %y
+ %elt = extractelement <vscale x 2 x i32> %a, i64 0
+ ret i32 %elt
+}
+
+
+define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsitofp_nxv32f16_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfncvt.f.x.w v28, v16, v0.t
+; CHECK-NEXT: bltu a0, a1, .LBB26_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB26_2:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vfncvt.f.x.w v24, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <4 x float> @tail_vmv_v_i_treat_as_vmv_s_x(<8 x float> %x, <8 x float> %y) optsize {
+; CHECK-LABEL: tail_vmv_v_i_treat_as_vmv_s_x:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vmul.vx v14, v12, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vadd.vi v8, v14, -14
+; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
+; CHECK-NEXT: vmv1r.v v8, v12
+; CHECK-NEXT: ret
+ %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>
+ ret <4 x float> %z
+}
+
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)
+declare <11 x i64> @llvm.vp.and.v11i64(<11 x i64>, <11 x i64>, <11 x i1>, i32)
+declare <vscale x 16 x double> @llvm.vp.gather.nxv16f64.nxv16p0(<vscale x 16 x ptr>, <vscale x 16 x i1>, i32)
+declare <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32>, ptr nocapture, i64)
+declare <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32>, ptr nocapture, i64)
+declare <vscale x 16 x i32> @llvm.riscv.vadd.nxv16i32.nxv16i32.i64(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.riscv.vadd.nxv8i32.nxv8i32.i64(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, i64)
+declare void @llvm.riscv.vse.nxv8i32.i64(<vscale x 8 x i32>, ptr nocapture, i64)
+declare void @llvm.riscv.vse.nxv16i32.i64(<vscale x 16 x i32>, ptr nocapture, i64)
+declare <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+declare <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1>, <vscale x 32 x i32>, <vscale x 32 x i32>, i32)
+declare <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float>, <vscale x 32 x i1>, i32)
+declare <vscale x 32 x half> @llvm.fma.v32f16(<vscale x 32 x half>, <vscale x 32 x half>, <vscale x 32 x half>)
+declare void @llvm.masked.scatter.nxv16f64.nxv16p0(<vscale x 16 x double>, <vscale x 16 x ptr>, i32, <vscale x 16 x i1>)
+declare <vscale x 16 x double> @llvm.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double>, <vscale x 8 x double>, i64)
+declare <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr>, <vscale x 8 x ptr>, i64)
+declare <32 x i64> @llvm.vp.select.v32i64(<32 x i1>, <32 x i64>, <32 x i64>, i32)
+declare <2 x i32> @llvm.masked.load.v2i32(ptr, i32, <2 x i1>, <2 x i32>)
+declare <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64>, <128 x i1>, i32)
+declare <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64>, <15 x i1>, i32)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare <vscale x 16 x i64> @llvm.vp.ctpop.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.bitreverse.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare <vscale x 7 x i64> @llvm.vp.bitreverse.nxv7i64(<vscale x 7 x i64>, <vscale x 7 x i1>, i32)
+declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, metadata, metadata)
>From 0475023f8b5874a5b9f5c77cc8e287f02ec88c4c Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Wed, 24 Jan 2024 22:47:58 -0800
Subject: [PATCH 2/5] [RISCV] postRA vsetvl insertion pass
---
.../llvm}/CodeGen/LiveDebugVariables.h | 2 +-
llvm/lib/CodeGen/LiveDebugVariables.cpp | 2 +-
llvm/lib/CodeGen/RegAllocBasic.cpp | 2 +-
llvm/lib/CodeGen/RegAllocGreedy.cpp | 2 +-
llvm/lib/CodeGen/VirtRegMap.cpp | 2 +-
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 393 +++++--
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 14 +-
llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll | 1029 ++++++++++-------
8 files changed, 913 insertions(+), 533 deletions(-)
rename llvm/{lib => include/llvm}/CodeGen/LiveDebugVariables.h (96%)
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.h b/llvm/include/llvm/CodeGen/LiveDebugVariables.h
similarity index 96%
rename from llvm/lib/CodeGen/LiveDebugVariables.h
rename to llvm/include/llvm/CodeGen/LiveDebugVariables.h
index 9998ce9e8dad86..3643a2cd0981f1 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.h
+++ b/llvm/include/llvm/CodeGen/LiveDebugVariables.h
@@ -29,7 +29,7 @@ template <typename T> class ArrayRef;
class LiveIntervals;
class VirtRegMap;
-class LLVM_LIBRARY_VISIBILITY LiveDebugVariables : public MachineFunctionPass {
+class LiveDebugVariables : public MachineFunctionPass {
void *pImpl = nullptr;
public:
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 7cb90af5ff173e..3a59ae7ab06644 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -18,7 +18,7 @@
//
//===----------------------------------------------------------------------===//
-#include "LiveDebugVariables.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index 6661991396302e..5bd3b126aa1666 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -12,10 +12,10 @@
//===----------------------------------------------------------------------===//
#include "AllocationOrder.h"
-#include "LiveDebugVariables.h"
#include "RegAllocBase.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index a208bf89fadf29..348277224c7aee 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -14,7 +14,6 @@
#include "RegAllocGreedy.h"
#include "AllocationOrder.h"
#include "InterferenceCache.h"
-#include "LiveDebugVariables.h"
#include "RegAllocBase.h"
#include "RegAllocEvictionAdvisor.h"
#include "RegAllocPriorityAdvisor.h"
@@ -31,6 +30,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalUnion.h"
#include "llvm/CodeGen/LiveIntervals.h"
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 48f4ee29fbe95d..2c778980f5d1e0 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -16,9 +16,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/VirtRegMap.h"
-#include "LiveDebugVariables.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveStacks.h"
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a14f9a28354737..86073c881486b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -27,7 +27,9 @@
#include "RISCV.h"
#include "RISCVSubtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <queue>
using namespace llvm;
@@ -48,6 +50,44 @@ static cl::opt<bool> UseStrictAsserts(
namespace {
+// For the SSA form, we can simply use getVRegDef to find the reaching
+// definition. For the non-SSA form, we retrieve the reaching definition of a
+// specific register from LiveIntervals/VNInfo.
+template <typename T>
+static T *getReachingDefMI(Register Reg, T *MI, const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
+ if (MRI->isSSA())
+ return MRI->getVRegDef(Reg);
+
+ if (!MI)
+ return MRI->getUniqueVRegDef(Reg);
+
+ // At O0, LiveIntervals is not available.
+ if (!LIS)
+ return nullptr;
+
+ // If MI itself defines Reg, it is the reaching definition.
+ if (llvm::any_of(MI->defs(), [Reg](const MachineOperand MO) {
+ return MO.isReg() && MO.getReg() == Reg;
+ }))
+ return MI;
+
+ if (Reg.isVirtual() && LIS->hasInterval(Reg)) {
+ auto &LI = LIS->getInterval(Reg);
+ SlotIndexes *SIs = LIS->getSlotIndexes();
+ SlotIndex SI = SIs->getInstructionIndex(*MI);
+ VNInfo *Valno = LI.getVNInfoBefore(SI);
+ if (!Valno || Valno->isPHIDef())
+ return nullptr;
+ MachineInstr *DefMI = SIs->getInstructionFromIndex(Valno->def);
+ return DefMI;
+ }
+
+ // TODO: Handle physical registers.
+
+ return nullptr;
+}
+
static unsigned getVLOpNum(const MachineInstr &MI) {
return RISCVII::getVLOpNum(MI.getDesc());
}
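In short, the new lookup falls back from the SSA def-chain to LiveIntervals once the pass runs after register allocation. A condensed sketch of that fallback, using the same LLVM APIs as the helper above (the standalone function name is illustrative and it assumes the includes already present in this file):

static const MachineInstr *findReachingDef(Register Reg, const MachineInstr &At,
                                           const MachineRegisterInfo &MRI,
                                           const LiveIntervals *LIS) {
  if (MRI.isSSA())
    return MRI.getVRegDef(Reg);                 // SSA: the unique def, no LIS needed.
  if (!LIS || !Reg.isVirtual() || !LIS->hasInterval(Reg))
    return nullptr;                             // No liveness info (e.g. at O0).
  SlotIndexes *SIs = LIS->getSlotIndexes();
  SlotIndex Idx = SIs->getInstructionIndex(At);
  const VNInfo *VNI = LIS->getInterval(Reg).getVNInfoBefore(Idx);
  if (!VNI || VNI->isPHIDef())
    return nullptr;                             // Value comes in via a PHI/live-in.
  return SIs->getInstructionFromIndex(VNI->def);
}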
@@ -175,7 +215,8 @@ static bool isMaskRegOp(const MachineInstr &MI) {
/// specification. Agnostic requires each lane to either be undisturbed, or
/// take the value -1; no other value is allowed.
static bool hasUndefinedMergeOp(const MachineInstr &MI,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
unsigned UseOpIdx;
if (!MI.isRegTiedToUseOperand(0, &UseOpIdx))
@@ -190,13 +231,18 @@ static bool hasUndefinedMergeOp(const MachineInstr &MI,
if (UseMO.getReg() == RISCV::NoRegister)
return true;
- if (MachineInstr *UseMI = MRI.getVRegDef(UseMO.getReg())) {
+ if (!MRI.isSSA() && UseMO.isUndef())
+ return true;
+
+ if (const MachineInstr *UseMI =
+ getReachingDefMI(UseMO.getReg(), &MI, &MRI, LIS)) {
if (UseMI->isImplicitDef())
return true;
if (UseMI->isRegSequence()) {
for (unsigned i = 1, e = UseMI->getNumOperands(); i < e; i += 2) {
- MachineInstr *SourceMI = MRI.getVRegDef(UseMI->getOperand(i).getReg());
+ const MachineInstr *SourceMI =
+ getReachingDefMI(UseMI->getOperand(i).getReg(), UseMI, &MRI, LIS);
if (!SourceMI || !SourceMI->isImplicitDef())
return false;
}
@@ -347,7 +393,7 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
/// Return the fields and properties demanded by the provided instruction.
DemandedFields getDemanded(const MachineInstr &MI,
const MachineRegisterInfo *MRI,
- const RISCVSubtarget *ST) {
+ const RISCVSubtarget *ST, const LiveIntervals *LIS) {
// Warning: This function has to work on both the lowered (i.e. post
// emitVSETVLIs) and pre-lowering forms. The main implication of this is
// that it can't use the value of a SEW, VL, or Policy operand as they might
@@ -409,7 +455,7 @@ DemandedFields getDemanded(const MachineInstr &MI,
// this for any tail agnostic operation, but we can't as TA requires
// tail lanes to either be the original value or -1. We are writing
// unknown bits to the lanes here.
- if (hasUndefinedMergeOp(MI, *MRI)) {
+ if (hasUndefinedMergeOp(MI, *MRI, LIS)) {
if (isFloatScalarMoveOrScalarSplatInstr(MI) && !ST->hasVInstructionsF64())
Res.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
else
@@ -505,13 +551,15 @@ class VSETVLIInfo {
bool getTailAgnostic() const { return TailAgnostic; }
bool getMaskAgnostic() const { return MaskAgnostic; }
- bool hasNonZeroAVL(const MachineRegisterInfo &MRI) const {
+ bool hasNonZeroAVL(const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) const {
if (hasAVLImm())
return getAVLImm() > 0;
if (hasAVLReg()) {
if (getAVLReg() == RISCV::X0)
return true;
- if (MachineInstr *MI = MRI.getVRegDef(getAVLReg());
+ if (MachineInstr *MI =
+ getReachingDefMI(getAVLReg(), (MachineInstr *)nullptr, &MRI, LIS);
MI && isNonZeroLoadImmediate(*MI))
return true;
return false;
@@ -520,10 +568,11 @@ class VSETVLIInfo {
}
bool hasEquallyZeroAVL(const VSETVLIInfo &Other,
- const MachineRegisterInfo &MRI) const {
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) const {
if (hasSameAVL(Other))
return true;
- return (hasNonZeroAVL(MRI) && Other.hasNonZeroAVL(MRI));
+ return (hasNonZeroAVL(MRI, LIS) && Other.hasNonZeroAVL(MRI, LIS));
}
bool hasSameAVL(const VSETVLIInfo &Other) const {
@@ -602,7 +651,8 @@ class VSETVLIInfo {
// Require are compatible with the previous vsetvli instruction represented
// by this. MI is the instruction whose requirements we're considering.
bool isCompatible(const DemandedFields &Used, const VSETVLIInfo &Require,
- const MachineRegisterInfo &MRI) const {
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) const {
assert(isValid() && Require.isValid() &&
"Can't compare invalid VSETVLIInfos");
assert(!Require.SEWLMULRatioOnly &&
@@ -618,7 +668,7 @@ class VSETVLIInfo {
if (Used.VLAny && !hasSameAVL(Require))
return false;
- if (Used.VLZeroness && !hasEquallyZeroAVL(Require, MRI))
+ if (Used.VLZeroness && !hasEquallyZeroAVL(Require, MRI, LIS))
return false;
return hasCompatibleVTYPE(Used, Require);
@@ -743,6 +793,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
const RISCVSubtarget *ST;
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
+ LiveIntervals *LIS = nullptr;
std::vector<BlockData> BlockInfo;
std::queue<const MachineBasicBlock *> WorkList;
@@ -755,6 +806,15 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+
+ AU.addUsedIfAvailable<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addUsedIfAvailable<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addUsedIfAvailable<LiveDebugVariables>();
+ AU.addPreserved<LiveDebugVariables>();
+ AU.addPreserved<LiveStacks>();
+
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -763,8 +823,8 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
private:
bool needVSETVLI(const MachineInstr &MI, const VSETVLIInfo &Require,
const VSETVLIInfo &CurInfo) const;
- bool needVSETVLIPHI(const VSETVLIInfo &Require,
- const MachineBasicBlock &MBB) const;
+ bool needVSETVLIPHI(const VSETVLIInfo &Require, const MachineBasicBlock &MBB,
+ const MachineInstr &MI) const;
void insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo);
void insertVSETVLI(MachineBasicBlock &MBB,
@@ -820,12 +880,13 @@ static unsigned computeVLMAX(unsigned VLEN, unsigned SEW,
static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
const RISCVSubtarget &ST,
- const MachineRegisterInfo *MRI) {
+ const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
VSETVLIInfo InstrInfo;
bool TailAgnostic = true;
bool MaskAgnostic = true;
- if (!hasUndefinedMergeOp(MI, *MRI)) {
+ if (!hasUndefinedMergeOp(MI, *MRI, LIS)) {
// Start with undisturbed.
TailAgnostic = false;
MaskAgnostic = false;
@@ -891,7 +952,8 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
// register AVLs to avoid extending live ranges without being sure we can
// kill the original source reg entirely.
if (InstrInfo.hasAVLReg() && InstrInfo.getAVLReg().isVirtual()) {
- MachineInstr *DefMI = MRI->getVRegDef(InstrInfo.getAVLReg());
+ const MachineInstr *DefMI =
+ getReachingDefMI(InstrInfo.getAVLReg(), &MI, MRI, LIS);
if (DefMI && isVectorConfigInstr(*DefMI)) {
VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI);
if (DefInstrInfo.hasSameVLMAX(InstrInfo) &&
@@ -911,6 +973,62 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
insertVSETVLI(MBB, MachineBasicBlock::iterator(&MI), DL, Info, PrevInfo);
}
+static void fixupModifyVRegLI(Register VReg, LiveIntervals *LIS) {
+ if (!LIS)
+ return;
+
+ if (LIS->hasInterval(VReg))
+ LIS->removeInterval(VReg);
+ LIS->createAndComputeVirtRegInterval(VReg);
+
+ // After needVSETVLIPHI, the recomputed interval may trigger a
+ // "Multiple connected components in live interval" verifier error,
+ // so split any separate components.
+ auto &LI = LIS->getInterval(VReg);
+ SmallVector<LiveInterval *, 8> SplitLIs;
+ LIS->splitSeparateComponents(LI, SplitLIs);
+}
+
+static void getVRegFromMI(MachineInstr *MI, SmallVector<Register> &VRegs) {
+ for (auto &MO : MI->operands()) {
+ if (!MO.isReg() || MO.getReg() == 0 || !MO.getReg().isVirtual())
+ continue;
+ Register Reg = MO.getReg();
+ VRegs.push_back(Reg);
+ }
+}
+
+static void fixupModifyVRegLIAfterInsertMI(MachineInstr *MI,
+ LiveIntervals *LIS) {
+
+ if (!LIS)
+ return;
+
+ if (LIS->isNotInMIMap(*MI))
+ LIS->InsertMachineInstrInMaps(*MI);
+
+ SmallVector<Register> NeedFixupVReg;
+ getVRegFromMI(MI, NeedFixupVReg);
+
+ for (auto VReg : NeedFixupVReg)
+ fixupModifyVRegLI(VReg, LIS);
+}
+
+static void removeMIAndFixupModifyVRegLI(MachineInstr *MI, LiveIntervals *LIS) {
+
+ SmallVector<Register> NeedFixupVReg;
+ getVRegFromMI(MI, NeedFixupVReg);
+
+ // Remove MI from the slot index maps before erasing it; eraseFromParent
+ // deallocates the instruction, so it must not be referenced afterwards.
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+
+ MI->eraseFromParent();
+
+ for (auto VReg : NeedFixupVReg)
+ fixupModifyVRegLI(VReg, LIS);
+}
+
void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt, DebugLoc DL,
const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) {
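The fixup helpers above follow the usual recipe for mutating MIR while LiveIntervals is attached: give any newly inserted instruction a slot index, then recompute the interval of every virtual register the change touches. A minimal sketch of that recipe (illustrative helper name, same APIs as the helpers above):

static void reindexAndRecompute(MachineInstr &NewMI, LiveIntervals *LIS) {
  if (!LIS)
    return;                                      // Nothing to maintain at O0.
  if (LIS->isNotInMIMap(NewMI))
    LIS->InsertMachineInstrInMaps(NewMI);        // Assign the new MI a slot index.
  for (const MachineOperand &MO : NewMI.operands()) {
    if (!MO.isReg() || !MO.getReg().isVirtual())
      continue;
    Register Reg = MO.getReg();
    if (LIS->hasInterval(Reg))
      LIS->removeInterval(Reg);                  // Drop the stale interval...
    LIS->createAndComputeVirtRegInterval(Reg);   // ...and recompute it from scratch.
  }
}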
@@ -920,11 +1038,13 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
// VLMAX.
if (Info.hasSameAVL(PrevInfo) && Info.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE())
- .addReg(RISCV::VL, RegState::Implicit);
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -933,15 +1053,18 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// same, we can use the X0, X0 form.
if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg() &&
Info.getAVLReg().isVirtual()) {
- if (MachineInstr *DefMI = MRI->getVRegDef(Info.getAVLReg())) {
+ if (const MachineInstr *DefMI =
+ getReachingDefMI(Info.getAVLReg(), &(*InsertPt), MRI, LIS)) {
if (isVectorConfigInstr(*DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE())
- .addReg(RISCV::VL, RegState::Implicit);
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
}
@@ -950,10 +1073,12 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
}
if (Info.hasAVLImm()) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addImm(Info.getAVLImm())
- .addImm(Info.encodeVTYPE());
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addImm(Info.getAVLImm())
+ .addImm(Info.encodeVTYPE());
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -963,18 +1088,22 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// the previous vl to become invalid.
if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
Info.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE())
- .addReg(RISCV::VL, RegState::Implicit);
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
// Otherwise use an AVL of 1 to avoid depending on previous vl.
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addImm(1)
- .addImm(Info.encodeVTYPE());
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addImm(1)
+ .addImm(Info.encodeVTYPE());
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -990,10 +1119,12 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass);
Opcode = RISCV::PseudoVSETVLIX0;
}
- BuildMI(MBB, InsertPt, DL, TII->get(Opcode))
- .addReg(DestReg, RegState::Define | RegState::Dead)
- .addReg(AVLReg)
- .addImm(Info.encodeVTYPE());
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(Opcode))
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addReg(AVLReg, (LIS && MRI->def_empty(AVLReg)) ? RegState::Undef : 0)
+ .addImm(Info.encodeVTYPE());
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
}
static bool isLMUL1OrSmaller(RISCVII::VLMUL LMUL) {
@@ -1006,12 +1137,13 @@ static bool isLMUL1OrSmaller(RISCVII::VLMUL LMUL) {
bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
const VSETVLIInfo &Require,
const VSETVLIInfo &CurInfo) const {
- assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, MRI));
+ assert(Require ==
+ computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, MRI, LIS));
if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly())
return true;
- DemandedFields Used = getDemanded(MI, MRI, ST);
+ DemandedFields Used = getDemanded(MI, MRI, ST, LIS);
// A slidedown/slideup with an *undefined* merge op can freely clobber
// elements not copied from the source vector (e.g. masked off, tail, or
@@ -1022,7 +1154,8 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// * The LMUL1 restriction is for machines whose latency may depend on VL.
// * As above, this is only legal for tail "undefined" not "agnostic".
if (isVSlideInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
- isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+ isLMUL1OrSmaller(CurInfo.getVLMUL()) &&
+ hasUndefinedMergeOp(MI, *MRI, LIS)) {
Used.VLAny = false;
Used.VLZeroness = true;
Used.LMUL = false;
@@ -1034,8 +1167,9 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// immediate form of vmv.s.x, and thus frequently use vmv.v.i in it's place.
// Since a splat is non-constant time in LMUL, we do need to be careful to not
// increase the number of active vector registers (unlike for vmv.s.x.)
- if (isScalarSplatInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
- isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+ if (isScalarSplatInstr(MI) && Require.hasAVLImm() &&
+ Require.getAVLImm() == 1 && isLMUL1OrSmaller(CurInfo.getVLMUL()) &&
+ hasUndefinedMergeOp(MI, *MRI, LIS)) {
Used.LMUL = false;
Used.SEWLMULRatio = false;
Used.VLAny = false;
@@ -1046,7 +1180,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
Used.TailPolicy = false;
}
- if (CurInfo.isCompatible(Used, Require, *MRI))
+ if (CurInfo.isCompatible(Used, Require, *MRI, LIS))
return false;
// We didn't find a compatible value. If our AVL is a virtual register,
@@ -1055,7 +1189,8 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// VSETVLI here.
if (Require.hasAVLReg() && Require.getAVLReg().isVirtual() &&
CurInfo.hasCompatibleVTYPE(Used, Require)) {
- if (MachineInstr *DefMI = MRI->getVRegDef(Require.getAVLReg())) {
+ if (const MachineInstr *DefMI =
+ getReachingDefMI(Require.getAVLReg(), &MI, MRI, LIS)) {
if (isVectorConfigInstr(*DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo))
@@ -1071,7 +1206,9 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// maintain the SEW/LMUL ratio. This allows us to eliminate VL toggles in more
// places.
static VSETVLIInfo adjustIncoming(VSETVLIInfo PrevInfo, VSETVLIInfo NewInfo,
- DemandedFields &Demanded) {
+ DemandedFields &Demanded,
+ const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
VSETVLIInfo Info = NewInfo;
if (!Demanded.LMUL && !Demanded.SEWLMULRatio && PrevInfo.isValid() &&
@@ -1094,7 +1231,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
if (!RISCVII::hasSEWOp(TSFlags))
return;
- const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, MRI);
+ const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, MRI, LIS);
assert(NewInfo.isValid() && !NewInfo.isUnknown());
if (Info.isValid() && !needVSETVLI(MI, NewInfo, Info))
return;
@@ -1103,8 +1240,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
if (!Info.isValid() || Info.isUnknown())
Info = NewInfo;
- DemandedFields Demanded = getDemanded(MI, MRI, ST);
- const VSETVLIInfo IncomingInfo = adjustIncoming(PrevInfo, NewInfo, Demanded);
+ DemandedFields Demanded = getDemanded(MI, MRI, ST, LIS);
+ const VSETVLIInfo IncomingInfo =
+ adjustIncoming(PrevInfo, NewInfo, Demanded, MRI, LIS);
// If MI only demands that VL has the same zeroness, we only need to set the
// AVL if the zeroness differs. This removes a vsetvli entirely if the types
@@ -1113,7 +1251,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
// variant, so we avoid the transform to prevent extending live range of an
// avl register operand.
// TODO: We can probably relax this for immediates.
- bool EquallyZero = IncomingInfo.hasEquallyZeroAVL(PrevInfo, *MRI) &&
+ bool EquallyZero = IncomingInfo.hasEquallyZeroAVL(PrevInfo, *MRI, LIS) &&
IncomingInfo.hasSameVLMAX(PrevInfo);
if (Demanded.VLAny || (Demanded.VLZeroness && !EquallyZero))
Info.setAVL(IncomingInfo);
@@ -1236,7 +1374,8 @@ void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) {
// be unneeded if the AVL is a phi node where all incoming values are VL
// outputs from the last VSETVLI in their respective basic blocks.
bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
- const MachineBasicBlock &MBB) const {
+ const MachineBasicBlock &MBB,
+ const MachineInstr &MI) const {
if (DisableInsertVSETVLPHIOpt)
return true;
@@ -1247,32 +1386,67 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
if (!AVLReg.isVirtual())
return true;
- // We need the AVL to be produce by a PHI node in this basic block.
- MachineInstr *PHI = MRI->getVRegDef(AVLReg);
- if (!PHI || PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
- return true;
+ if (!MRI->isSSA()) {
- for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
- PHIOp += 2) {
- Register InReg = PHI->getOperand(PHIOp).getReg();
- MachineBasicBlock *PBB = PHI->getOperand(PHIOp + 1).getMBB();
- const BlockData &PBBInfo = BlockInfo[PBB->getNumber()];
- // If the exit from the predecessor has the VTYPE we are looking for
- // we might be able to avoid a VSETVLI.
- if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
+ // At O0, LiveIntervals is not available.
+ if (!LIS)
return true;
- // We need the PHI input to the be the output of a VSET(I)VLI.
- MachineInstr *DefMI = MRI->getVRegDef(InReg);
- if (!DefMI || !isVectorConfigInstr(*DefMI))
+ LiveRange &LR = LIS->getInterval(Require.getAVLReg());
+ SlotIndexes *SIs = LIS->getSlotIndexes();
+ SlotIndex SI = SIs->getInstructionIndex(MI);
+ VNInfo *Valno = LR.getVNInfoAt(SI);
+ if (!Valno || !Valno->isPHIDef())
return true;
- // We found a VSET(I)VLI make sure it matches the output of the
- // predecessor block.
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
- if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
- !DefInfo.hasSameVTYPE(PBBInfo.Exit))
+ for (auto *PredMBB : MBB.predecessors()) {
+ const BlockData &PBBInfo = BlockInfo[PredMBB->getNumber()];
+ if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
+ return true;
+
+ const VNInfo *Value = LR.getVNInfoBefore(LIS->getMBBEndIdx(PredMBB));
+ if (!Value)
+ return true;
+
+ // TODO: DefMI is a COPY in most cases; we may want to search
+ // until we encounter a non-COPY instruction.
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(Value->def);
+ if (!DefMI || !isVectorConfigInstr(*DefMI))
+ return true;
+
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
+ if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
+ !DefInfo.hasSameVTYPE(PBBInfo.Exit))
+ return true;
+ }
+ } else {
+ // We need the AVL to be produced by a PHI node in this basic block.
+ const MachineInstr *PHI = getReachingDefMI(AVLReg, &MI, MRI, LIS);
+ if (!PHI || PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
return true;
+
+ for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
+ PHIOp += 2) {
+ Register InReg = PHI->getOperand(PHIOp).getReg();
+ MachineBasicBlock *PBB = PHI->getOperand(PHIOp + 1).getMBB();
+ const BlockData &PBBInfo = BlockInfo[PBB->getNumber()];
+ // If the exit from the predecessor has the VTYPE we are looking for
+ // we might be able to avoid a VSETVLI.
+ if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
+ return true;
+
+ // We need the PHI input to be the output of a VSET(I)VLI.
+ const MachineInstr *DefMI = getReachingDefMI(InReg, PHI, MRI, LIS);
+ if (!DefMI || !isVectorConfigInstr(*DefMI))
+ return true;
+
+ // We found a VSET(I)VLI; make sure it matches the output of the
+ // predecessor block.
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
+ if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
+ !DefInfo.hasSameVTYPE(PBBInfo.Exit))
+ return true;
+ }
}
// If all the incoming values to the PHI checked out, we don't need
@@ -1310,7 +1484,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
// wouldn't be used and VL/VTYPE registers are correct. Note that
// we *do* need to model the state as if it changed as while the
// register contents are unchanged, the abstract model can change.
- if (!PrefixTransparent || needVSETVLIPHI(CurInfo, MBB))
+ if (!PrefixTransparent || needVSETVLIPHI(CurInfo, MBB, MI))
insertVSETVLI(MBB, MI, CurInfo, PrevInfo);
PrefixTransparent = false;
}
@@ -1319,9 +1493,11 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
if (VLOp.isReg()) {
Register Reg = VLOp.getReg();
- MachineInstr *VLOpDef = MRI->getVRegDef(Reg);
+ MachineInstr *VLOpDef = getReachingDefMI(Reg, &MI, MRI, LIS);
// Erase the AVL operand from the instruction.
+ Register VLOpReg = VLOp.getReg();
+ bool IsVirtVLOpReg = VLOp.getReg().isVirtual();
VLOp.setReg(RISCV::NoRegister);
VLOp.setIsKill(false);
@@ -1331,7 +1507,9 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
// dead now.
if (VLOpDef && TII->isAddImmediate(*VLOpDef, Reg) &&
MRI->use_nodbg_empty(Reg))
- VLOpDef->eraseFromParent();
+ removeMIAndFixupModifyVRegLI(VLOpDef, LIS);
+ if (IsVirtVLOpReg)
+ fixupModifyVRegLI(VLOpReg, LIS);
}
MI.addOperand(MachineOperand::CreateReg(RISCV::VL, /*isDef*/ false,
/*isImp*/ true));
@@ -1417,7 +1595,10 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
// we need to prove the value is available at the point we're going
// to insert the vsetvli at.
if (AvailableInfo.hasAVLReg() && RISCV::X0 != AvailableInfo.getAVLReg()) {
- MachineInstr *AVLDefMI = MRI->getVRegDef(AvailableInfo.getAVLReg());
+ const MachineInstr *AVLDefMI = getReachingDefMI(
+ AvailableInfo.getAVLReg(),
+ UnavailablePred->empty() ? nullptr : &UnavailablePred->instr_back(),
+ MRI, LIS);
if (!AVLDefMI)
return;
// This is an inline dominance check which covers the case of
@@ -1492,7 +1673,8 @@ static void doUnion(DemandedFields &A, DemandedFields B) {
static bool canMutatePriorConfig(const MachineInstr &PrevMI,
const MachineInstr &MI,
const DemandedFields &Used,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
// If the VL values aren't equal, return false if either a) the former is
// demanded, or b) we can't rewrite the former to be the later for
// implementation reasons.
@@ -1504,13 +1686,12 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
if (isVLPreservingConfig(PrevMI))
return false;
if (!getInfoForVSETVLI(PrevMI).hasEquallyZeroAVL(getInfoForVSETVLI(MI),
- MRI))
+ MRI, LIS))
return false;
}
auto &AVL = MI.getOperand(1);
auto &PrevAVL = PrevMI.getOperand(1);
- assert(MRI.isSSA());
// If the AVL is a register, we need to make sure MI's AVL dominates PrevMI.
// For now just check that PrevMI uses the same virtual register.
@@ -1538,10 +1719,12 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) {
Used.demandVL();
Used.demandVTYPE();
SmallVector<MachineInstr*> ToDelete;
+ SmallVector<MachineInstr *> MIInBetween;
for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
if (!isVectorConfigInstr(MI)) {
- doUnion(Used, getDemanded(MI, MRI, ST));
+ MIInBetween.push_back(&MI);
+ doUnion(Used, getDemanded(MI, MRI, ST, LIS));
continue;
}
@@ -1551,11 +1734,29 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) {
Used.demandVL();
if (NextMI) {
+
+ // A tail-undefined vmv.v.i/x or vfmv.v.f with VL=1 can be treated as
+ // semantically equivalent to vmv.s.x.
+ if (MIInBetween.size() == 1 && isScalarSplatInstr(*MIInBetween[0]) &&
+ MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 1 &&
+ isLMUL1OrSmaller(RISCVVType::getVLMUL(MI.getOperand(2).getImm())) &&
+ hasUndefinedMergeOp(*MIInBetween[0], *MRI, LIS)) {
+ Used.LMUL = false;
+ Used.SEWLMULRatio = false;
+ Used.VLAny = false;
+ if (isFloatScalarMoveOrScalarSplatInstr(*MIInBetween[0]) &&
+ !ST->hasVInstructionsF64())
+ Used.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
+ else
+ Used.SEW = DemandedFields::SEWGreaterThanOrEqual;
+ Used.TailPolicy = false;
+ }
+
if (!Used.usedVL() && !Used.usedVTYPE()) {
ToDelete.push_back(&MI);
// Leave NextMI unchanged
continue;
- } else if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) {
+ } else if (canMutatePriorConfig(MI, *NextMI, Used, *MRI, LIS)) {
if (!isVLPreservingConfig(*NextMI)) {
MI.getOperand(0).setReg(NextMI->getOperand(0).getReg());
MI.getOperand(0).setIsDead(false);
@@ -1570,7 +1771,7 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) {
MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg);
if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) &&
MRI->use_nodbg_empty(OldVLReg))
- VLOpDef->eraseFromParent();
+ removeMIAndFixupModifyVRegLI(VLOpDef, LIS);
}
MI.setDesc(NextMI->getDesc());
}
@@ -1580,12 +1781,14 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) {
}
}
NextMI = &MI;
- Used = getDemanded(MI, MRI, ST);
+ Used = getDemanded(MI, MRI, ST, LIS);
+ MIInBetween.clear();
}
NumRemovedVSETVL += ToDelete.size();
+
for (auto *MI : ToDelete)
- MI->eraseFromParent();
+ removeMIAndFixupModifyVRegLI(MI, LIS);
}
void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
@@ -1593,11 +1796,16 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
MachineInstr &MI = *I++;
if (RISCV::isFaultFirstLoad(MI)) {
Register VLOutput = MI.getOperand(1).getReg();
- if (!MRI->use_nodbg_empty(VLOutput))
- BuildMI(MBB, I, MI.getDebugLoc(), TII->get(RISCV::PseudoReadVL),
- VLOutput);
+ bool IsVirtual = MI.getOperand(1).getReg().isVirtual();
+ if (!MRI->use_nodbg_empty(VLOutput)) {
+ auto NeedFixupMI = BuildMI(MBB, I, MI.getDebugLoc(),
+ TII->get(RISCV::PseudoReadVL), VLOutput);
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ }
// We don't use the vl output of the VLEFF/VLSEGFF anymore.
MI.getOperand(1).setReg(RISCV::X0);
+ if (IsVirtual)
+ fixupModifyVRegLI(VLOutput, LIS);
}
}
}
@@ -1612,6 +1820,7 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
MRI = &MF.getRegInfo();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
assert(BlockInfo.empty() && "Expect empty block infos");
BlockInfo.resize(MF.getNumBlockIDs());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index ae1a6f179a49e3..d4bece728dd187 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -101,6 +101,10 @@ static cl::opt<bool> EnableMISchedLoadClustering(
cl::desc("Enable load clustering in the machine scheduler"),
cl::init(false));
+static cl::opt<bool> EnableVSETVLIAfterRVVRegAlloc(
+ "riscv-vsetvli-after-rvv-regalloc", cl::Hidden,
+ cl::desc("vsetvl insertion after rvv regalloc"), cl::init(false));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -392,8 +396,11 @@ FunctionPass *RISCVPassConfig::createRVVRegAllocPass(bool Optimized) {
}
bool RISCVPassConfig::addRegAssignAndRewriteFast() {
- if (EnableSplitRegAlloc)
+ if (EnableSplitRegAlloc) {
addPass(createRVVRegAllocPass(false));
+ if (EnableVSETVLIAfterRVVRegAlloc)
+ addPass(createRISCVInsertVSETVLIPass());
+ }
return TargetPassConfig::addRegAssignAndRewriteFast();
}
@@ -401,6 +408,8 @@ bool RISCVPassConfig::addRegAssignAndRewriteOptimized() {
if (EnableSplitRegAlloc) {
addPass(createRVVRegAllocPass(true));
addPass(createVirtRegRewriter(false));
+ if (EnableVSETVLIAfterRVVRegAlloc)
+ addPass(createRISCVInsertVSETVLIPass());
}
return TargetPassConfig::addRegAssignAndRewriteOptimized();
}
@@ -540,7 +549,8 @@ void RISCVPassConfig::addPreRegAlloc() {
addPass(createRISCVPreRAExpandPseudoPass());
if (TM->getOptLevel() != CodeGenOptLevel::None)
addPass(createRISCVMergeBaseOffsetOptPass());
- addPass(createRISCVInsertVSETVLIPass());
+ if (!EnableSplitRegAlloc || !EnableVSETVLIAfterRVVRegAlloc)
+ addPass(createRISCVInsertVSETVLIPass());
if (TM->getOptLevel() != CodeGenOptLevel::None &&
EnableRISCVDeadRegisterElimination)
addPass(createRISCVDeadRegisterDefinitionsPass());
diff --git a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
index 8204cec7e27794..65270ea3a40eee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b -target-abi=lp64d \
-; RUN: --riscv-split-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
+; RUN: --riscv-split-regalloc=1 -riscv-vsetvli-after-rvv-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
define <vscale x 2 x i1> @fcmp_ole_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb) nounwind strictfp {
; CHECK-LABEL: fcmp_ole_vv_nxv2f16:
@@ -32,41 +32,39 @@ define dso_local void @test_interleave_cause_spill(ptr nocapture noundef %in) lo
; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: addi a1, a0, 12
; CHECK-NEXT: vle32.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 16
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: addi a1, a0, 20
; CHECK-NEXT: vle32.v v4, (a1)
+; CHECK-NEXT: addi a1, a0, 20
+; CHECK-NEXT: vle32.v v0, (a1)
; CHECK-NEXT: addi a1, a0, 24
; CHECK-NEXT: vle32.v v16, (a1)
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v24, v8, v24
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vadd.vv v20, v0, v4
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vv v8, v8, v24
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vadd.vv v16, v0, v16
+; CHECK-NEXT: vadd.vv v20, v4, v0
+; CHECK-NEXT: vadd.vv v16, v4, v16
; CHECK-NEXT: addi a1, a0, 40
; CHECK-NEXT: vse32.v v20, (a1)
; CHECK-NEXT: addi a1, a0, 44
; CHECK-NEXT: vse32.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 48
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vse32.v v16, (a1)
; CHECK-NEXT: addi a0, a0, 52
; CHECK-NEXT: vse32.v v8, (a0)
@@ -321,18 +319,19 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
; CHECK-LABEL: constant_folding_crash:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld a0, 8(a0)
-; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: seqz a0, a0
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v13, a0
-; CHECK-NEXT: vmsne.vi v0, v13, 0
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vmsne.vi v12, v12, 0
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v13
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vrgather.vi v9, v8, 0
; CHECK-NEXT: vmsne.vi v0, v9, 0
@@ -411,13 +410,13 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-NEXT: sltu a0, a0, a1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: and a0, a0, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
; CHECK-NEXT: vand.vx v16, v16, a2, v0.t
; CHECK-NEXT: vsub.vv v16, v8, v16, v0.t
@@ -532,13 +531,14 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 5
; CHECK-NEXT: vmv.s.x v10, a2
+; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vle32.v v12, (a0)
; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 6
; CHECK-NEXT: vmv.s.x v10, a3
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
-; CHECK-NEXT: vle32.v v12, (a0)
; CHECK-NEXT: vslideup.vi v8, v10, 7
; CHECK-NEXT: vadd.vv v8, v8, v12
; CHECK-NEXT: ret
@@ -569,14 +569,14 @@ define <8 x i1> @fp2si_v8f64_v8i1(<8 x double> %x) {
define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i32_v2i32_2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a1)
; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vslideup.vi v8, v10, 2
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v10, (a0)
+; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
@@ -592,13 +592,14 @@ define void @buildvec_seq_v9i8(ptr %x) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 3
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vmv.v.i v9, 3
; CHECK-NEXT: li a1, 146
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmerge.vim v8, v8, 2, v0
+; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v9, 2, v0
; CHECK-NEXT: vsetivli zero, 9, e8, m1, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
@@ -626,373 +627,422 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 52
+; CHECK-NEXT: li a3, 74
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x34, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 52 * vlenb
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xca, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 74 * vlenb
; CHECK-NEXT: addi a2, a1, 256
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 27
+; CHECK-NEXT: li a3, 25
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: addi a2, a1, 128
-; CHECK-NEXT: vle64.v v8, (a2)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 35
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a3, a1, 6
+; CHECK-NEXT: add a1, a3, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v8, v16, 4
+; CHECK-NEXT: vrgather.vi v12, v16, 4
; CHECK-NEXT: li a1, 128
-; CHECK-NEXT: vmv.s.x v4, a1
+; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v16, 8
+; CHECK-NEXT: vslidedown.vi v16, v16, 8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: li a3, 49
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v4
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
-; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs1r.v v4, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vrgather.vi v8, v24, 2, v0.t
-; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: vrgather.vi v12, v16, 2, v0.t
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vmul.vx v2, v8, a1
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vmul.vx v2, v10, a1
+; CHECK-NEXT: li a1, 56
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 57
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv.s.x v7, a1
+; CHECK-NEXT: vadd.vi v10, v2, -16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v2
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: li a1, 56
-; CHECK-NEXT: vmv.s.x v1, a1
-; CHECK-NEXT: vadd.vi v16, v2, -16
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v2
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v10, v0.t
; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: vmv.v.v v12, v16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 4
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: li a2, 21
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 25
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v8, v16, 5
-; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgather.vi v12, v16, 5
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmv1r.v v6, v8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: li a2, 49
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v8, v16, 3, v0.t
-; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vrgather.vi v12, v16, 3, v0.t
+; CHECK-NEXT: vmv.v.v v28, v12
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs2r.v v2, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vadd.vi v24, v2, 1
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v26, v2, -15
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v24, v2, -15
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs2r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v26, v0.t
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v28, v16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: lui a1, 16
+; CHECK-NEXT: addi a1, a1, 7
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 6
+; CHECK-NEXT: vmv.v.x v10, a1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 25
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v16, v9
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: li a2, 45
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v2, v0.t
-; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgatherei16.vv v12, v16, v10
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: li a2, 41
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vadd.vi v4, v2, 2
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v4
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: vmv.s.x v4, a1
-; CHECK-NEXT: vadd.vi v16, v2, -14
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v4
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v12, 6
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: vrgather.vi v12, v16, 2
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 37
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vmv4r.v v24, v16
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vi v12, v16, 3
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: vmv.s.x v1, a1
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v24, v2, 2
+; CHECK-NEXT: vadd.vi v4, v2, -14
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: slli a2, a1, 6
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v24
; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v16, v24, 4, v0.t
-; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v16, v8
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v4, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 3
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: li a2, 25
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v28, v2, 3
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: li a2, 49
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v28
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v16, v2, -13
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v4
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 45
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
-; CHECK-NEXT: lui a1, 16
-; CHECK-NEXT: addi a1, a1, 7
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgather.vi v20, v16, 4, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 45
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v4, v2, 3
+; CHECK-NEXT: vadd.vi v8, v2, -13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v24, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v4
; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
-; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v24, v16, 5, v0.t
-; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v24, v8
+; CHECK-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: slli a2, a1, 3
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v6
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 49
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 41
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgather.vi v8, v24, 5, v0.t
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 41
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: lui a1, 96
+; CHECK-NEXT: li a2, 192
+; CHECK-NEXT: vmv.s.x v28, a2
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v8, a1
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; CHECK-NEXT: li a1, 192
-; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vmv1r.v v0, v28
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: li a2, 37
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v12, v24, v8, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 37
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v4, v24, 2
-; CHECK-NEXT: vrgatherei16.vv v4, v16, v8, v0.t
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: li a1, 28
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v26, v2, 4
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v30, v2, 4
+; CHECK-NEXT: vadd.vi v6, v2, -12
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v26
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: li a1, 28
-; CHECK-NEXT: vmv.s.x v1, a1
-; CHECK-NEXT: vadd.vi v16, v2, -12
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v30
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
-; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v6, v0.t
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: lui a1, 112
; CHECK-NEXT: addi a1, a1, 1
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vmv1r.v v0, v28
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v8, v16, 3
+; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: li a2, 45
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: li a2, 25
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v16, v24
+; CHECK-NEXT: vmv2r.v v8, v2
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vadd.vi v12, v2, 5
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v24, v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v2, v8, -11
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v12, v2, -11
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vrgatherei16.vv v24, v8, v2, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 41
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
+; CHECK-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: vmv.v.v v12, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 37
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv.v.v v20, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: addi a1, a0, 320
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vse64.v v4, (a1)
+; CHECK-NEXT: vse64.v v20, (a1)
; CHECK-NEXT: addi a1, a0, 192
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a3, a2, 1
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: vse64.v v12, (a1)
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vse64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 64
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a3, a2, 3
-; CHECK-NEXT: sub a2, a3, a2
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vse64.v v8, (a1)
-; CHECK-NEXT: addi a1, a0, 64
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 11
-; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 4
+; CHECK-NEXT: add a2, a3, a2
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 4
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: li a2, 21
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 52
+; CHECK-NEXT: li a1, 74
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
@@ -1020,110 +1070,114 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 56
-; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: slli a2, a2, 6
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
-; CHECK-NEXT: vmv1r.v v4, v0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 5
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v1, v0, 8
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v2, v0, 4
-; CHECK-NEXT: addi a2, a1, 512
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: li a3, 40
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v29, v0, 8
+; CHECK-NEXT: addi a2, a1, 512
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v27, v1, 4
-; CHECK-NEXT: addi a2, a1, 640
+; CHECK-NEXT: vslidedown.vi v27, v29, 4
+; CHECK-NEXT: addi a3, a1, 640
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: vle64.v v8, (a3)
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: addi a2, a7, -64
-; CHECK-NEXT: sltu a3, a7, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a4, a3, a2
-; CHECK-NEXT: addi a2, a4, -32
-; CHECK-NEXT: sltu a3, a4, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a3, a3, a2
-; CHECK-NEXT: addi a2, a3, -16
-; CHECK-NEXT: sltu a5, a3, a2
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a2, a5, a2
; CHECK-NEXT: vslidedown.vi v0, v27, 2
-; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: addi a3, a7, -64
+; CHECK-NEXT: sltu a4, a7, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a4, a4, a3
+; CHECK-NEXT: addi a3, a4, -32
+; CHECK-NEXT: sltu a5, a4, a3
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a3, a5, a3
+; CHECK-NEXT: addi a5, a3, -16
+; CHECK-NEXT: sltu a6, a3, a5
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: and a5, a6, a5
+; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma
+; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 24
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: addi a5, a1, 128
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v26, v7, 4
; CHECK-NEXT: bltu a3, a2, .LBB14_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: .LBB14_2:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v12, v2, 2
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a5)
-; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT: li a3, 64
-; CHECK-NEXT: vmv1r.v v0, v27
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: li a6, 40
-; CHECK-NEXT: mul a5, a5, a6
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
+; CHECK-NEXT: vle64.v v8, (a5)
; CHECK-NEXT: csrr a5, vlenb
; CHECK-NEXT: li a6, 48
; CHECK-NEXT: mul a5, a5, a6
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a7, a3, .LBB14_4
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v28, v26, 2
+; CHECK-NEXT: li a5, 64
+; CHECK-NEXT: vmv1r.v v0, v27
+; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a6, 56
+; CHECK-NEXT: mul a3, a3, a6
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a6, a7
+; CHECK-NEXT: bltu a7, a5, .LBB14_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: li a7, 64
+; CHECK-NEXT: li a6, 64
; CHECK-NEXT: .LBB14_4:
; CHECK-NEXT: addi a5, a1, 384
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
-; CHECK-NEXT: addi a6, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a6, a7, -32
-; CHECK-NEXT: sltu t0, a7, a6
-; CHECK-NEXT: addi t0, t0, -1
-; CHECK-NEXT: and a6, t0, a6
+; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: add t0, sp, t0
+; CHECK-NEXT: addi t0, t0, 16
+; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi t0, a6, -32
+; CHECK-NEXT: sltu a6, a6, t0
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: and a6, a6, t0
; CHECK-NEXT: addi t0, a6, -16
; CHECK-NEXT: sltu t1, a6, t0
; CHECK-NEXT: addi t1, t1, -1
; CHECK-NEXT: and t0, t1, t0
+; CHECK-NEXT: vmv1r.v v0, v28
+; CHECK-NEXT: csrr t1, vlenb
+; CHECK-NEXT: li t2, 48
+; CHECK-NEXT: mul t1, t1, t2
+; CHECK-NEXT: add t1, sp, t1
+; CHECK-NEXT: addi t1, t1, 16
+; CHECK-NEXT: vl8r.v v16, (t1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, t0, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
; CHECK-NEXT: csrr t0, vlenb
-; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: slli t0, t0, 4
; CHECK-NEXT: add t0, sp, t0
; CHECK-NEXT: addi t0, t0, 16
; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
@@ -1131,131 +1185,143 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: li a6, 16
; CHECK-NEXT: .LBB14_6:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v3, v1, 2
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a5)
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a1, 256
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v27, v29, 2
+; CHECK-NEXT: vmv1r.v v0, v26
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 3
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v2
-; CHECK-NEXT: addi a5, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t
+; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: li a6, 40
+; CHECK-NEXT: li a6, 48
; CHECK-NEXT: mul a5, a5, a6
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a5, a4
; CHECK-NEXT: bltu a4, a3, .LBB14_8
; CHECK-NEXT: # %bb.7:
-; CHECK-NEXT: li a4, 32
+; CHECK-NEXT: li a5, 32
; CHECK-NEXT: .LBB14_8:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: addi a1, a4, -16
-; CHECK-NEXT: sltu a5, a4, a1
+; CHECK-NEXT: addi a1, a5, -16
+; CHECK-NEXT: sltu a5, a5, a1
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a1, a5, a1
+; CHECK-NEXT: vmv1r.v v0, v27
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v3
; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
; CHECK-NEXT: bltu a4, a2, .LBB14_10
; CHECK-NEXT: # %bb.9:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: .LBB14_10:
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v5, v4, 2
+; CHECK-NEXT: vslidedown.vi v6, v7, 2
+; CHECK-NEXT: vmv1r.v v0, v29
; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: mv a1, a7
; CHECK-NEXT: bltu a7, a3, .LBB14_12
; CHECK-NEXT: # %bb.11:
-; CHECK-NEXT: li a7, 32
+; CHECK-NEXT: li a1, 32
; CHECK-NEXT: .LBB14_12:
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 56
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 24
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 48
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 48
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 40
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 56
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 48
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 40
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 48
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; CHECK-NEXT: vmv4r.v v8, v0
; CHECK-NEXT: vslideup.vi v8, v24, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a7, -16
-; CHECK-NEXT: sltu a4, a7, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 24
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a4, a1, -16
+; CHECK-NEXT: sltu a1, a1, a4
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a1, a1, a4
+; CHECK-NEXT: vmv1r.v v0, v6
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v5
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
; CHECK-NEXT: bltu a7, a2, .LBB14_14
; CHECK-NEXT: # %bb.13:
; CHECK-NEXT: li a7, 16
; CHECK-NEXT: .LBB14_14:
-; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: li a2, 40
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vi v24, v16, 16
; CHECK-NEXT: vse32.v v24, (a0)
; CHECK-NEXT: addi a1, a0, 256
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: li a3, 48
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
@@ -1263,15 +1329,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: addi a0, a0, 384
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 48
+; CHECK-NEXT: li a2, 56
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 56
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 6
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -1382,12 +1447,12 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
@@ -1404,25 +1469,57 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: slli a2, a2, 5
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v16, (a1)
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsoxei64.v v8, (zero), v24, v0.t
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vl8re64.v v8, (a1)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a0
+; CHECK-NEXT: vslidedown.vx v24, v0, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei64.v v16, (zero), v8, v0.t
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsoxei64.v v8, (zero), v16, v0.t
+; CHECK-NEXT: vsoxei64.v v16, (zero), v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -1446,23 +1543,22 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: vl8re64.v v8, (a1)
-; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vadd.vv v16, v8, v8
-; CHECK-NEXT: vrgather.vv v8, v0, v16
+; CHECK-NEXT: mul a0, a0, a2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v0, (a1)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vadd.vv v16, v8, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
@@ -1470,34 +1566,47 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vrgather.vv v24, v8, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vv v8, v0, v16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vadd.vi v8, v16, 1
-; CHECK-NEXT: vrgather.vv v16, v0, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vv v16, v0, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vrgather.vv v24, v0, v8
-; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v28, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmv4r.v v28, v8
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v20, v8
; CHECK-NEXT: vmv8r.v v8, v24
@@ -1527,7 +1636,15 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a2, a1, 2
; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
@@ -1537,16 +1654,22 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v28, v16, v0.t
+; CHECK-NEXT: vfncvt.rtz.x.f.w v20, v24, v0.t
; CHECK-NEXT: bltu a0, a1, .LBB22_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vfncvt.rtz.x.f.w v24, v8, v0.t
-; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x i16> %v
@@ -1557,26 +1680,27 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsext.vf4 v16, v8
-; CHECK-NEXT: vsll.vi v24, v16, 3
; CHECK-NEXT: vsext.vf4 v16, v10
; CHECK-NEXT: vsll.vi v16, v16, 3
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: srli a4, a2, 3
; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vluxei64.v v16, (a0), v16, v0.t
+; CHECK-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsext.vf4 v24, v8
+; CHECK-NEXT: vsll.vi v24, v24, 3
; CHECK-NEXT: bltu a1, a2, .LBB23_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vluxei64.v v8, (a0), v24, v0.t
; CHECK-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i16> %idxs
@@ -1591,11 +1715,17 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1604,35 +1734,51 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
; CHECK-NEXT: slli a1, a3, 3
; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vl8re32.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: slli a1, a3, 1
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: sltu a5, a2, a4
; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a4, a5, a4
; CHECK-NEXT: srli a3, a3, 2
-; CHECK-NEXT: vl8re32.v v0, (a0)
+; CHECK-NEXT: vl8re32.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a3
+; CHECK-NEXT: vslidedown.vx v0, v0, a3
+; CHECK-NEXT: and a4, a5, a4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
; CHECK-NEXT: bltu a2, a1, .LBB24_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB24_2:
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -1645,9 +1791,10 @@ define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, <v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vadd.vv v12, v12, v12
-; CHECK-NEXT: vs4r.v v12, (a0)
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: vs4r.v v12, (a0)
+; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: ret
%index = add <vscale x 4 x i64> %x, %x
store <vscale x 4 x i64> %index, <vscale x 4 x i64>* %y
@@ -1659,7 +1806,15 @@ define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, <v
define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vsitofp_nxv32f16_nxv32i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a2, a1, 2
; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
@@ -1669,16 +1824,22 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfncvt.f.x.w v28, v16, v0.t
+; CHECK-NEXT: vfncvt.f.x.w v20, v24, v0.t
; CHECK-NEXT: bltu a0, a1, .LBB26_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB26_2:
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vfncvt.f.x.w v24, v8, v0.t
-; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vfncvt.f.x.w v16, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
@@ -1693,9 +1854,9 @@ define <4 x float> @tail_vmv_v_i_treat_as_vmv_s_x(<8 x float> %x, <8 x float> %y
; CHECK-NEXT: vmul.vx v14, v12, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vadd.vi v8, v14, -14
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vadd.vi v8, v14, -14
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v12
>From 76c629207011c74e2850ff1b8b127efa8bd506c9 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 11 Mar 2024 06:58:25 -0700
Subject: [PATCH 3/5] Also fix LIS index after insert
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 86073c881486b0..b10b12c85aa744 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1007,6 +1007,7 @@ static void fixupModifyVRegLIAfterInsertMI(MachineInstr *MI,
if (LIS->isNotInMIMap(*MI))
LIS->InsertMachineInstrInMaps(*MI);
+ LIS->handleMove(*MI);
SmallVector<Register> NeedFixupVReg;
getVRegFromMI(MI, NeedFixupVReg);
>From 8cf0423982e3517557ec9beae2acb4b52e349036 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 1 Apr 2024 02:52:47 -0700
Subject: [PATCH 4/5] Make sure canMutatePriorConfig uses the same AVLReg def
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index b10b12c85aa744..1e0ddf250ddd1b 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1699,8 +1699,18 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
if (AVL.isReg() && AVL.getReg() != RISCV::X0) {
if (AVL.getReg().isPhysical())
return false;
- if (!PrevAVL.isReg() || PrevAVL.getReg() != AVL.getReg())
+ if (!PrevAVL.isReg())
return false;
+ if (MRI.isSSA()) {
+ if (PrevAVL.getReg() != AVL.getReg())
+ return false;
+ } else {
+ auto *AVLDef = getReachingDefMI(AVL.getReg(), &MI, &MRI, LIS);
+ auto *PrevAVLDef =
+ getReachingDefMI(PrevAVL.getReg(), &PrevMI, &MRI, LIS);
+ if (!AVLDef->isIdenticalTo(*PrevAVLDef))
+ return false;
+ }
}
}
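The intent of this hunk, as I read it: once the machine function is out of SSA form, two operands that carry the same virtual-register number are no longer guaranteed to observe the same defining instruction, so canMutatePriorConfig compares the reaching definitions at each use rather than the register numbers alone. Below is a minimal, self-contained C++ sketch of that idea using hypothetical stand-in types (Instr, reachingDef, sameAVLSource) rather than real LLVM APIs; the actual patch compares the defining MachineInstrs with isIdenticalTo, while pointer equality stands in for that identity check here.

#include <cassert>
#include <optional>
#include <vector>

struct Instr {
  int pos;                      // position in the block, increasing downward
  std::optional<unsigned> def;  // register defined here, if any
};

// Reaching-def query: the last definition of `reg` strictly above `usePos`.
static const Instr *reachingDef(const std::vector<Instr> &block, unsigned reg,
                                int usePos) {
  const Instr *found = nullptr;
  for (const Instr &I : block) {
    if (I.pos >= usePos)
      break;
    if (I.def && *I.def == reg)
      found = &I;
  }
  return found;
}

// Post-SSA check: two uses of the same register number only agree when the
// same instruction defines that register at both program points.
static bool sameAVLSource(const std::vector<Instr> &block, unsigned reg,
                          int useA, int useB) {
  const Instr *A = reachingDef(block, reg, useA);
  const Instr *B = reachingDef(block, reg, useB);
  return A && B && A == B;
}

int main() {
  // Register 5 is defined twice after regalloc; uses at positions 2 and 4
  // see different definitions even though the register number matches.
  std::vector<Instr> block = {{0, 5u}, {2, std::nullopt}, {3, 5u}, {4, std::nullopt}};
  assert(sameAVLSource(block, 5, 2, 2));
  assert(!sameAVLSource(block, 5, 2, 4));
  return 0;
}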
>From d3f13af9d85bd81e3e07c65905506fbd6ffd0f1c Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 1 Apr 2024 02:55:21 -0700
Subject: [PATCH 5/5] Update getReachingDefMI start point in doPRE
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 1e0ddf250ddd1b..300965a136d605 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1596,10 +1596,9 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
// we need to prove the value is available at the point we're going
// to insert the vsetvli at.
if (AvailableInfo.hasAVLReg() && RISCV::X0 != AvailableInfo.getAVLReg()) {
- const MachineInstr *AVLDefMI = getReachingDefMI(
- AvailableInfo.getAVLReg(),
- UnavailablePred->empty() ? nullptr : &UnavailablePred->instr_back(),
- MRI, LIS);
+ const MachineInstr *AVLDefMI =
+ getReachingDefMI(AvailableInfo.getAVLReg(),
+ MBB.empty() ? nullptr : &MBB.instr_front(), MRI, LIS);
if (!AVLDefMI)
return;
// This is an inline dominance check which covers the case of