[llvm] [RISCV] Support postRA vsetvl insertion pass (PR #70549)
Piyou Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 29 05:40:04 PDT 2024
https://github.com/BeMg updated https://github.com/llvm/llvm-project/pull/70549
From 77951c9c741ad1c1f18a1b936708c61b832d6f1d Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Wed, 24 Jan 2024 22:46:58 -0800
Subject: [PATCH 01/19] Precommit for testcase
---
llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll | 1733 +++++++++++++++++
1 file changed, 1733 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
new file mode 100644
index 00000000000000..8204cec7e27794
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
@@ -0,0 +1,1733 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b -target-abi=lp64d \
+; RUN: --riscv-split-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 2 x i1> @fcmp_ole_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb) nounwind strictfp {
+; CHECK-LABEL: fcmp_ole_vv_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vmfeq.vv v10, v9, v9
+; CHECK-NEXT: vmfeq.vv v11, v8, v8
+; CHECK-NEXT: vmand.mm v0, v11, v10
+; CHECK-NEXT: vmfle.vv v0, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
+ ret <vscale x 2 x i1> %1
+}
+
+define dso_local void @test_interleave_cause_spill(ptr nocapture noundef %in) local_unnamed_addr #0 {
+; CHECK-LABEL: test_interleave_cause_spill:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: addi a1, a0, 4
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 8
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: addi a1, a0, 12
+; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a0, 16
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vle32.v v0, (a1)
+; CHECK-NEXT: addi a1, a0, 20
+; CHECK-NEXT: vle32.v v4, (a1)
+; CHECK-NEXT: addi a1, a0, 24
+; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vadd.vv v24, v8, v24
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vadd.vv v20, v0, v4
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vadd.vv v8, v8, v24
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vadd.vv v16, v0, v16
+; CHECK-NEXT: addi a1, a0, 40
+; CHECK-NEXT: vse32.v v20, (a1)
+; CHECK-NEXT: addi a1, a0, 44
+; CHECK-NEXT: vse32.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 48
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v16, (a1)
+; CHECK-NEXT: addi a0, a0, 52
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+entry:
+ %add.ptr = getelementptr inbounds i32, ptr %in, i64 1
+ %0 = tail call <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32> poison, ptr nonnull %add.ptr, i64 32)
+ %add.ptr1 = getelementptr inbounds i32, ptr %in, i64 2
+ %1 = tail call <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32> poison, ptr nonnull %add.ptr1, i64 32)
+ %add.ptr2 = getelementptr inbounds i32, ptr %in, i64 3
+ %2 = tail call <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32> poison, ptr nonnull %add.ptr2, i64 32)
+ %add.ptr3 = getelementptr inbounds i32, ptr %in, i64 4
+ %3 = tail call <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32> poison, ptr nonnull %add.ptr3, i64 32)
+ %add.ptr4 = getelementptr inbounds i32, ptr %in, i64 5
+ %4 = tail call <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32> poison, ptr nonnull %add.ptr4, i64 32)
+ %add.ptr5 = getelementptr inbounds i32, ptr %in, i64 6
+ %5 = tail call <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32> poison, ptr nonnull %add.ptr5, i64 32)
+ %6 = tail call <vscale x 16 x i32> @llvm.riscv.vadd.nxv16i32.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 32)
+ %7 = tail call <vscale x 8 x i32> @llvm.riscv.vadd.nxv8i32.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> %3, <vscale x 8 x i32> %4, i64 32)
+ %8 = tail call <vscale x 16 x i32> @llvm.riscv.vadd.nxv16i32.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %0, <vscale x 16 x i32> %2, i64 32)
+ %9 = tail call <vscale x 8 x i32> @llvm.riscv.vadd.nxv8i32.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> %3, <vscale x 8 x i32> %5, i64 32)
+ %add.ptr6 = getelementptr inbounds i32, ptr %in, i64 10
+ tail call void @llvm.riscv.vse.nxv8i32.i64(<vscale x 8 x i32> %7, ptr nonnull %add.ptr6, i64 32)
+ %add.ptr7 = getelementptr inbounds i32, ptr %in, i64 11
+ tail call void @llvm.riscv.vse.nxv8i32.i64(<vscale x 8 x i32> %9, ptr nonnull %add.ptr7, i64 32)
+ %add.ptr8 = getelementptr inbounds i32, ptr %in, i64 12
+ tail call void @llvm.riscv.vse.nxv16i32.i64(<vscale x 16 x i32> %6, ptr nonnull %add.ptr8, i64 32)
+ %add.ptr9 = getelementptr inbounds i32, ptr %in, i64 13
+ tail call void @llvm.riscv.vse.nxv16i32.i64(<vscale x 16 x i32> %8, ptr nonnull %add.ptr9, i64 32)
+ ret void
+}
+
+define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlz_v15i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsrl.vx v16, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v8, a0, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: lui a0, 4112
+; CHECK-NEXT: addiw a0, a0, 257
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 56
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
+ ret <15 x i64> %v
+}
+
+define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bitreverse_nxv7i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: lui a1, 4080
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v16, v16, 24, v0.t
+; CHECK-NEXT: li a0, 255
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v24, v24, 8, v0.t
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 56
+; CHECK-NEXT: vsll.vx v24, v8, a2, v0.t
+; CHECK-NEXT: lui a3, 16
+; CHECK-NEXT: addiw a3, a3, -256
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsll.vx v16, v16, a4, v0.t
+; CHECK-NEXT: vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsrl.vx v24, v8, a2, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a4, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vor.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 7 x i64> @llvm.vp.bitreverse.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 %evl)
+ ret <vscale x 7 x i64> %v
+}
+
+define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bitreverse_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: lui a1, 4080
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v16, v16, 24, v0.t
+; CHECK-NEXT: li a0, 255
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v24, v24, 8, v0.t
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 56
+; CHECK-NEXT: vsll.vx v24, v8, a2, v0.t
+; CHECK-NEXT: lui a3, 16
+; CHECK-NEXT: addiw a3, a3, -256
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsll.vx v16, v16, a4, v0.t
+; CHECK-NEXT: vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsrl.vx v24, v8, a2, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a4, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vor.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i64> @llvm.vp.bitreverse.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lanes.b, <4 x i1> %sel) {
+; CHECK-LABEL: constant_folding_crash:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld a0, 8(a0)
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v13, a0
+; CHECK-NEXT: vmsne.vi v0, v13, 0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 10
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+entry:
+ %sunkaddr = getelementptr i8, ptr %v54, i64 8
+ %v56 = load i64, ptr %sunkaddr, align 8
+ %trunc = and i64 %v56, 1
+ %cmp = icmp eq i64 %trunc, 0
+ %ptrs = select i1 %cmp, <4 x ptr> %lanes.a, <4 x ptr> %lanes.b
+ %v67 = extractelement <4 x ptr> %ptrs, i64 0
+ %mask = shufflevector <4 x i1> %sel, <4 x i1> undef, <4 x i32> zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> <i32 10, i32 10, i32 10, i32 10>, ptr %v67, i32 16, <4 x i1> %mask)
+ ret void
+}
+
+define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctpop_nxv16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 3
+; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v24, v0, a2
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: bltu a0, a1, .LBB6_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a2, 349525
+; CHECK-NEXT: addiw a2, a2, 1365
+; CHECK-NEXT: slli a3, a2, 32
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a3, 209715
+; CHECK-NEXT: addiw a3, a3, 819
+; CHECK-NEXT: slli a4, a3, 32
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: lui a4, 61681
+; CHECK-NEXT: addiw a4, a4, -241
+; CHECK-NEXT: slli a5, a4, 32
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT: lui a5, 4112
+; CHECK-NEXT: addiw a5, a5, 257
+; CHECK-NEXT: slli a6, a5, 32
+; CHECK-NEXT: add a5, a5, a6
+; CHECK-NEXT: vmul.vx v8, v8, a5, v0.t
+; CHECK-NEXT: li a6, 56
+; CHECK-NEXT: vsrl.vx v8, v8, a6, v0.t
+; CHECK-NEXT: addi a7, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT: vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vand.vx v8, v16, a3, v0.t
+; CHECK-NEXT: vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT: vmul.vx v8, v8, a5, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a6, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i64> @llvm.vp.ctpop.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i64> %v
+}
+
+define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bitreverse_v15i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: lui a1, 4080
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v16, v16, 24, v0.t
+; CHECK-NEXT: li a0, 255
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v24, v24, 8, v0.t
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 56
+; CHECK-NEXT: vsll.vx v24, v8, a2, v0.t
+; CHECK-NEXT: lui a3, 16
+; CHECK-NEXT: addiw a3, a3, -256
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: vand.vx v16, v8, a3, v0.t
+; CHECK-NEXT: vsll.vx v16, v16, a4, v0.t
+; CHECK-NEXT: vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsrl.vx v24, v8, a2, v0.t
+; CHECK-NEXT: vsrl.vx v16, v8, a4, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vor.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT: lui a0, 61681
+; CHECK-NEXT: addiw a0, a0, -241
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT: lui a0, 209715
+; CHECK-NEXT: addiw a0, a0, 819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1365
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
+ ret <15 x i64> %v
+}
+
+define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: add_constant_rhs_8xi32_partial:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
+; CHECK-NEXT: vmv.s.x v10, a0
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 5
+; CHECK-NEXT: vmv.s.x v10, a2
+; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 6
+; CHECK-NEXT: vmv.s.x v10, a3
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; CHECK-NEXT: vle32.v v12, (a0)
+; CHECK-NEXT: vslideup.vi v8, v10, 7
+; CHECK-NEXT: vadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %vadd = add <8 x i32> %vin, <i32 1, i32 2, i32 3, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
+ %e0 = add i32 %a, 23
+ %e1 = add i32 %b, 25
+ %e2 = add i32 %c, 1
+ %e3 = add i32 %d, 2355
+ %v0 = insertelement <8 x i32> %vadd, i32 %e0, i32 4
+ %v1 = insertelement <8 x i32> %v0, i32 %e1, i32 5
+ %v2 = insertelement <8 x i32> %v1, i32 %e2, i32 6
+ %v3 = insertelement <8 x i32> %v2, i32 %e3, i32 7
+ ret <8 x i32> %v3
+}
+
+define <8 x i1> @fp2si_v8f64_v8i1(<8 x double> %x) {
+; CHECK-LABEL: fp2si_v8f64_v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
+; CHECK-NEXT: vand.vi v8, v12, 1
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %z = fptosi <8 x double> %x to <8 x i1>
+ ret <8 x i1> %z
+}
+
+define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
+; CHECK-LABEL: insert_v8i32_v2i32_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vse32.v v10, (a0)
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, ptr %svp
+ %vec = load <8 x i32>, ptr %vp
+ %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
+ store <8 x i32> %v, ptr %vp
+ ret void
+}
+
+define void @buildvec_seq_v9i8(ptr %x) {
+; CHECK-LABEL: buildvec_seq_v9i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 73
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 3
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: li a1, 146
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 2, v0
+; CHECK-NEXT: vsetivli zero, 9, e8, m1, ta, ma
+; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: ret
+ store <9 x i8> <i8 1, i8 2, i8 3, i8 1, i8 2, i8 3, i8 1, i8 2, i8 3>, ptr %x
+ ret void
+}
+
+define <4 x i1> @load_large_vector(ptr %p) {
+; CHECK-LABEL: load_large_vector:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: ret
+ %l = load <12 x ptr>, ptr %p
+ %s1 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %s2 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+ %ret = icmp ne <4 x ptr> %s1, %s2
+ ret <4 x i1> %ret
+}
+
+define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) {
+; CHECK-LABEL: load_factor6_too_big:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 52
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x34, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 52 * vlenb
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: addi a2, a1, 256
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 27
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a2, a1, 128
+; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 35
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vrgather.vi v8, v16, 4
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vmv.s.x v4, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v24, v16, 8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs1r.v v4, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vi v8, v24, 2, v0.t
+; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vmul.vx v2, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v2
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 56
+; CHECK-NEXT: vmv.s.x v1, a1
+; CHECK-NEXT: vadd.vi v16, v2, -16
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v8, v16, 5
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v8, v16, 3, v0.t
+; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs2r.v v2, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vadd.vi v24, v2, 1
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v24, v2, -15
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs2r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v2, v0.t
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vadd.vi v4, v2, 2
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v4
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: vmv.s.x v4, a1
+; CHECK-NEXT: vadd.vi v16, v2, -14
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 6
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v24, v16
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v16, v24, 4, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v16, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v28, v2, 3
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v28
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v16, v2, -13
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: lui a1, 16
+; CHECK-NEXT: addi a1, a1, 7
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v24, v16, v12
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v24, v16, 5, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v24, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: lui a1, 96
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: li a1, 192
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v4, v24, 2
+; CHECK-NEXT: vrgatherei16.vv v4, v16, v8, v0.t
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v26, v2, 4
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v26
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: li a1, 28
+; CHECK-NEXT: vmv.s.x v1, a1
+; CHECK-NEXT: vadd.vi v16, v2, -12
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: lui a1, 112
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vi v8, v16, 3
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v12, v2, 5
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v12, v2, -11
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: addi a1, a0, 320
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vse64.v v4, (a1)
+; CHECK-NEXT: addi a1, a0, 192
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a3, a2, 1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a3, a2, 3
+; CHECK-NEXT: sub a2, a3, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 64
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 11
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 52
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %interleaved.vec = load <48 x i64>, ptr %ptr
+ %v0 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
+ %v1 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43>
+ %v2 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44>
+ %v3 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45>
+ %v4 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46>
+ %v5 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47>
+ %res0 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} undef, <8 x i64> %v0, 0
+ %res1 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res0, <8 x i64> %v1, 1
+ %res2 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res1, <8 x i64> %v2, 2
+ %res3 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res2, <8 x i64> %v3, 3
+ %res4 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res3, <8 x i64> %v4, 4
+ %res5 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res4, <8 x i64> %v5, 5
+ ret {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res5
+}
+
+
+define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vtrunc_v128i32_v128i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 56
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; CHECK-NEXT: vmv1r.v v4, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 5
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v1, v0, 8
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v2, v0, 4
+; CHECK-NEXT: addi a2, a1, 512
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v27, v1, 4
+; CHECK-NEXT: addi a2, a1, 640
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: addi a2, a7, -64
+; CHECK-NEXT: sltu a3, a7, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a4, a3, a2
+; CHECK-NEXT: addi a2, a4, -32
+; CHECK-NEXT: sltu a3, a4, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a2
+; CHECK-NEXT: addi a2, a3, -16
+; CHECK-NEXT: sltu a5, a3, a2
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a2, a5, a2
+; CHECK-NEXT: vslidedown.vi v0, v27, 2
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: addi a5, a1, 128
+; CHECK-NEXT: bltu a3, a2, .LBB14_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v12, v2, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a5)
+; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
+; CHECK-NEXT: li a3, 64
+; CHECK-NEXT: vmv1r.v v0, v27
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 40
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 48
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a7, a3, .LBB14_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: li a7, 64
+; CHECK-NEXT: .LBB14_4:
+; CHECK-NEXT: addi a5, a1, 384
+; CHECK-NEXT: li a3, 32
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: addi a6, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a6, a7, -32
+; CHECK-NEXT: sltu t0, a7, a6
+; CHECK-NEXT: addi t0, t0, -1
+; CHECK-NEXT: and a6, t0, a6
+; CHECK-NEXT: addi t0, a6, -16
+; CHECK-NEXT: sltu t1, a6, t0
+; CHECK-NEXT: addi t1, t1, -1
+; CHECK-NEXT: and t0, t1, t0
+; CHECK-NEXT: vsetvli zero, t0, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: add t0, sp, t0
+; CHECK-NEXT: addi t0, t0, 16
+; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a6, a2, .LBB14_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: li a6, 16
+; CHECK-NEXT: .LBB14_6:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v3, v1, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a5)
+; CHECK-NEXT: addi a1, a1, 256
+; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v2
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 40
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a4, a3, .LBB14_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: li a4, 32
+; CHECK-NEXT: .LBB14_8:
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a4, -16
+; CHECK-NEXT: sltu a5, a4, a1
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a1, a5, a1
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
+; CHECK-NEXT: bltu a4, a2, .LBB14_10
+; CHECK-NEXT: # %bb.9:
+; CHECK-NEXT: li a4, 16
+; CHECK-NEXT: .LBB14_10:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v5, v4, 2
+; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: bltu a7, a3, .LBB14_12
+; CHECK-NEXT: # %bb.11:
+; CHECK-NEXT: li a7, 32
+; CHECK-NEXT: .LBB14_12:
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 48
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vslideup.vi v8, v16, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 48
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vslideup.vi v8, v16, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 40
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv4r.v v8, v0
+; CHECK-NEXT: vslideup.vi v8, v24, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a7, -16
+; CHECK-NEXT: sltu a4, a7, a1
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v5
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a1, a1, a4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT: bltu a7, a2, .LBB14_14
+; CHECK-NEXT: # %bb.13:
+; CHECK-NEXT: li a7, 16
+; CHECK-NEXT: .LBB14_14:
+; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v24, v16, 16
+; CHECK-NEXT: vse32.v v24, (a0)
+; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v8, (a1)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v8, (a1)
+; CHECK-NEXT: addi a0, a0, 384
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 48
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 56
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> %a, <128 x i1> %m, i32 %vl)
+ ret <128 x i32> %v
+}
+
+
+define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v2i32_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vmseq.vi v8, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.x.s a2, v8
+; CHECK-NEXT: andi a3, a2, 1
+; CHECK-NEXT: # implicit-def: $v8
+; CHECK-NEXT: beqz a3, .LBB15_2
+; CHECK-NEXT: # %bb.1: # %cond.load
+; CHECK-NEXT: lbu a3, 1(a0)
+; CHECK-NEXT: lbu a4, 0(a0)
+; CHECK-NEXT: lbu a5, 2(a0)
+; CHECK-NEXT: lb a6, 3(a0)
+; CHECK-NEXT: slli a3, a3, 8
+; CHECK-NEXT: or a3, a3, a4
+; CHECK-NEXT: slli a5, a5, 16
+; CHECK-NEXT: slli a6, a6, 24
+; CHECK-NEXT: or a4, a6, a5
+; CHECK-NEXT: or a3, a4, a3
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a3
+; CHECK-NEXT: .LBB15_2: # %else
+; CHECK-NEXT: andi a2, a2, 2
+; CHECK-NEXT: beqz a2, .LBB15_4
+; CHECK-NEXT: # %bb.3: # %cond.load1
+; CHECK-NEXT: lbu a2, 5(a0)
+; CHECK-NEXT: lbu a3, 4(a0)
+; CHECK-NEXT: lbu a4, 6(a0)
+; CHECK-NEXT: lb a0, 7(a0)
+; CHECK-NEXT: slli a2, a2, 8
+; CHECK-NEXT: or a2, a2, a3
+; CHECK-NEXT: slli a4, a4, 16
+; CHECK-NEXT: slli a0, a0, 24
+; CHECK-NEXT: or a0, a0, a4
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: .LBB15_4: # %else2
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vse32.v v8, (a1)
+; CHECK-NEXT: ret
+ %mask = icmp eq <2 x i32> %m, zeroinitializer
+ %load = call <2 x i32> @llvm.masked.load.v2i32(ptr %a, i32 1, <2 x i1> %mask, <2 x i32> undef)
+ store <2 x i32> %load, ptr %res_ptr
+ ret void
+}
+
+define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vand_vx_v11i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <11 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <11 x i64> %elt.head, <11 x i64> poison, <11 x i32> zeroinitializer
+ %v = call <11 x i64> @llvm.vp.and.v11i64(<11 x i64> %va, <11 x i64> %vb, <11 x i1> %m, i32 %evl)
+ ret <11 x i64> %v
+}
+
+define <11 x i64> @vand_vx_v11i64_unmasked(<11 x i64> %va, i64 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vand_vx_v11i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <11 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <11 x i64> %elt.head, <11 x i64> poison, <11 x i32> zeroinitializer
+ %head = insertelement <11 x i1> poison, i1 true, i32 0
+ %m = shufflevector <11 x i1> %head, <11 x i1> poison, <11 x i32> zeroinitializer
+ %v = call <11 x i64> @llvm.vp.and.v11i64(<11 x i64> %va, <11 x i64> %vb, <11 x i1> %m, i32 %evl)
+ ret <11 x i64> %v
+}
+
+define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_v32i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: bltu a2, a1, .LBB18_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB18_2:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: addi a0, a2, -16
+; CHECK-NEXT: sltu a1, a2, a0
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v0, v0, 2
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <32 x i64> @llvm.vp.select.v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 %evl)
+ ret <32 x i64> %v
+}
+
+define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, <vscale x 8 x ptr> %ptrs0, <vscale x 8 x ptr> %ptrs1, <vscale x 16 x i1> %m) {
+; CHECK-LABEL: mscatter_nxv16f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vl8re64.v v24, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v16, (a1)
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei64.v v8, (zero), v24, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a0
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsoxei64.v v8, (zero), v16, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %p0 = call <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr> undef, <vscale x 8 x ptr> %ptrs0, i64 0)
+ %p1 = call <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr> %p0, <vscale x 8 x ptr> %ptrs1, i64 8)
+ %v0 = call <vscale x 16 x double> @llvm.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> undef, <vscale x 8 x double> %val0, i64 0)
+ %v1 = call <vscale x 16 x double> @llvm.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> %v0, <vscale x 8 x double> %val1, i64 8)
+ call void @llvm.masked.scatter.nxv16f64.nxv16p0(<vscale x 16 x double> %v1, <vscale x 16 x ptr> %p1, i32 8, <vscale x 16 x i1> %m)
+ ret void
+}
+
+define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i64_nxv16i64(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv8i64_nxv16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vl8re64.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v0, (a0)
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vadd.vv v16, v8, v8
+; CHECK-NEXT: vrgather.vv v8, v0, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vv v24, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vadd.vi v8, v16, 1
+; CHECK-NEXT: vrgather.vv v16, v0, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vv v24, v0, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v28, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v20, v8
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %vec = load <vscale x 16 x i64>, ptr %p
+ %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+ ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
+}
+
+
+define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x half> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT: vfmacc.vv v8, v16, v24
+; CHECK-NEXT: ret
+ %vd = call <vscale x 32 x half> @llvm.fma.v32f16(<vscale x 32 x half> %vc, <vscale x 32 x half> %vb, <vscale x 32 x half> %va)
+ ret <vscale x 32 x half> %vd
+}
+
+
+define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v28, v16, v0.t
+; CHECK-NEXT: bltu a0, a1, .LBB22_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vfncvt.rtz.x.f.w v24, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpgather_baseidx_nxv16i16_nxv16f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsext.vf4 v16, v8
+; CHECK-NEXT: vsll.vi v24, v16, 3
+; CHECK-NEXT: vsext.vf4 v16, v10
+; CHECK-NEXT: vsll.vi v16, v16, 3
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: sub a3, a1, a2
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a4, a2, 3
+; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vluxei64.v v16, (a0), v16, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB23_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: .LBB23_2:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vluxei64.v v8, (a0), v24, v0.t
+; CHECK-NEXT: ret
+ %ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i16> %idxs
+ %v = call <vscale x 16 x double> @llvm.vp.gather.nxv16f64.nxv16p0(<vscale x 16 x ptr> %ptrs, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x double> %v
+}
+
+
+define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a1, a3, 3
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vl8re32.v v8, (a1)
+; CHECK-NEXT: slli a1, a3, 1
+; CHECK-NEXT: sub a4, a2, a1
+; CHECK-NEXT: sltu a5, a2, a4
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a4, a5, a4
+; CHECK-NEXT: srli a3, a3, 2
+; CHECK-NEXT: vl8re32.v v0, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a3
+; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: bltu a2, a1, .LBB24_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB24_2:
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, <vscale x 4 x i64>* %y) {
+; CHECK-LABEL: illegal_preserve_vl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v12, v12, v12
+; CHECK-NEXT: vs4r.v v12, (a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %index = add <vscale x 4 x i64> %x, %x
+ store <vscale x 4 x i64> %index, <vscale x 4 x i64>* %y
+ %elt = extractelement <vscale x 2 x i32> %a, i64 0
+ ret i32 %elt
+}
+
+
+define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsitofp_nxv32f16_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfncvt.f.x.w v28, v16, v0.t
+; CHECK-NEXT: bltu a0, a1, .LBB26_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB26_2:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vfncvt.f.x.w v24, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <4 x float> @tail_vmv_v_i_treat_as_vmv_s_x(<8 x float> %x, <8 x float> %y) optsize {
+; CHECK-LABEL: tail_vmv_v_i_treat_as_vmv_s_x:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vmul.vx v14, v12, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vadd.vi v8, v14, -14
+; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
+; CHECK-NEXT: vmv1r.v v8, v12
+; CHECK-NEXT: ret
+ %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>
+ ret <4 x float> %z
+}
+
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)
+declare <11 x i64> @llvm.vp.and.v11i64(<11 x i64>, <11 x i64>, <11 x i1>, i32)
+declare <vscale x 16 x double> @llvm.vp.gather.nxv16f64.nxv16p0(<vscale x 16 x ptr>, <vscale x 16 x i1>, i32)
+declare <vscale x 16 x i32> @llvm.riscv.vle.nxv16i32.i64(<vscale x 16 x i32>, ptr nocapture, i64)
+declare <vscale x 8 x i32> @llvm.riscv.vle.nxv8i32.i64(<vscale x 8 x i32>, ptr nocapture, i64)
+declare <vscale x 16 x i32> @llvm.riscv.vadd.nxv16i32.nxv16i32.i64(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.riscv.vadd.nxv8i32.nxv8i32.i64(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, i64)
+declare void @llvm.riscv.vse.nxv8i32.i64(<vscale x 8 x i32>, ptr nocapture, i64)
+declare void @llvm.riscv.vse.nxv16i32.i64(<vscale x 16 x i32>, ptr nocapture, i64)
+declare <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+declare <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1>, <vscale x 32 x i32>, <vscale x 32 x i32>, i32)
+declare <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float>, <vscale x 32 x i1>, i32)
+declare <vscale x 32 x half> @llvm.fma.v32f16(<vscale x 32 x half>, <vscale x 32 x half>, <vscale x 32 x half>)
+declare void @llvm.masked.scatter.nxv16f64.nxv16p0(<vscale x 16 x double>, <vscale x 16 x ptr>, i32, <vscale x 16 x i1>)
+declare <vscale x 16 x double> @llvm.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double>, <vscale x 8 x double>, i64)
+declare <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr>, <vscale x 8 x ptr>, i64)
+declare <32 x i64> @llvm.vp.select.v32i64(<32 x i1>, <32 x i64>, <32 x i64>, i32)
+declare <2 x i32> @llvm.masked.load.v2i32(ptr, i32, <2 x i1>, <2 x i32>)
+declare <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64>, <128 x i1>, i32)
+declare <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64>, <15 x i1>, i32)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare <vscale x 16 x i64> @llvm.vp.ctpop.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.bitreverse.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare <vscale x 7 x i64> @llvm.vp.bitreverse.nxv7i64(<vscale x 7 x i64>, <vscale x 7 x i1>, i32)
+declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, metadata, metadata)
>From 6c029e13f7657b136be33bb9f51ea1af479c5214 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Wed, 24 Jan 2024 22:47:58 -0800
Subject: [PATCH 02/19] [RISCV] postRA vsetvl insertion pass
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 345 ++++--
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 11 +-
llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll | 1029 ++++++++++-------
3 files changed, 872 insertions(+), 513 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index b5fd508fa77de2..523301022afef9 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -51,6 +51,44 @@ static cl::opt<bool> UseStrictAsserts(
namespace {
+// In SSA form, getVRegDef is enough to find the reaching definition. In
+// non-SSA form, we instead recover the reaching definition of a specific
+// register from LiveIntervals/VNInfo.
+template <typename T>
+static T *getReachingDefMI(Register Reg, T *MI, const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
+ if (MRI->isSSA())
+ return MRI->getVRegDef(Reg);
+
+ if (!MI)
+ return MRI->getUniqueVRegDef(Reg);
+
+ // LiveIntervals may be unavailable (e.g. at O0); give up in that case.
+ if (!LIS)
+ return nullptr;
+
+ // If MI itself defines Reg, it is its own reaching definition.
+ if (llvm::any_of(MI->defs(), [Reg](const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg() == Reg;
+ }))
+ return MI;
+
+ if (Reg.isVirtual() && LIS->hasInterval(Reg)) {
+ auto &LI = LIS->getInterval(Reg);
+ SlotIndexes *SIs = LIS->getSlotIndexes();
+ SlotIndex SI = SIs->getInstructionIndex(*MI);
+ VNInfo *Valno = LI.getVNInfoBefore(SI);
+ if (!Valno || Valno->isPHIDef())
+ return nullptr;
+ MachineInstr *DefMI = SIs->getInstructionFromIndex(Valno->def);
+ return DefMI;
+ }
+
+ // TODO: Handle physical register
+
+ return nullptr;
+}
+
static unsigned getVLOpNum(const MachineInstr &MI) {
return RISCVII::getVLOpNum(MI.getDesc());
}
@@ -178,7 +216,8 @@ static bool isMaskRegOp(const MachineInstr &MI) {
/// specification. Agnostic requires each lane to either be undisturbed, or
/// take the value -1; no other value is allowed.
static bool hasUndefinedMergeOp(const MachineInstr &MI,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
unsigned UseOpIdx;
if (!MI.isRegTiedToUseOperand(0, &UseOpIdx))
@@ -198,13 +237,15 @@ static bool hasUndefinedMergeOp(const MachineInstr &MI,
if (UseMO.getReg().isPhysical())
return false;
- if (MachineInstr *UseMI = MRI.getVRegDef(UseMO.getReg())) {
+ if (const MachineInstr *UseMI =
+ getReachingDefMI(UseMO.getReg(), &MI, &MRI, LIS)) {
if (UseMI->isImplicitDef())
return true;
if (UseMI->isRegSequence()) {
for (unsigned i = 1, e = UseMI->getNumOperands(); i < e; i += 2) {
- MachineInstr *SourceMI = MRI.getVRegDef(UseMI->getOperand(i).getReg());
+ const MachineInstr *SourceMI =
+ getReachingDefMI(UseMI->getOperand(i).getReg(), UseMI, &MRI, LIS);
if (!SourceMI || !SourceMI->isImplicitDef())
return false;
}
@@ -366,7 +407,7 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
/// Return the fields and properties demanded by the provided instruction.
DemandedFields getDemanded(const MachineInstr &MI,
const MachineRegisterInfo *MRI,
- const RISCVSubtarget *ST) {
+ const RISCVSubtarget *ST, const LiveIntervals *LIS) {
// Warning: This function has to work on both the lowered (i.e. post
// emitVSETVLIs) and pre-lowering forms. The main implication of this is
// that it can't use the value of a SEW, VL, or Policy operand as they might
@@ -430,7 +471,7 @@ DemandedFields getDemanded(const MachineInstr &MI,
// this for any tail agnostic operation, but we can't as TA requires
// tail lanes to either be the original value or -1. We are writing
// unknown bits to the lanes here.
- if (hasUndefinedMergeOp(MI, *MRI)) {
+ if (hasUndefinedMergeOp(MI, *MRI, LIS)) {
if (isFloatScalarMoveOrScalarSplatInstr(MI) && !ST->hasVInstructionsF64())
Res.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
else
@@ -649,7 +690,8 @@ class VSETVLIInfo {
// Require are compatible with the previous vsetvli instruction represented
// by this. MI is the instruction whose requirements we're considering.
bool isCompatible(const DemandedFields &Used, const VSETVLIInfo &Require,
- const MachineRegisterInfo &MRI) const {
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) const {
assert(isValid() && Require.isValid() &&
"Can't compare invalid VSETVLIInfos");
assert(!Require.SEWLMULRatioOnly &&
@@ -794,6 +836,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
const RISCVSubtarget *ST;
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
+ LiveIntervals *LIS = nullptr;
std::vector<BlockData> BlockInfo;
std::queue<const MachineBasicBlock *> WorkList;
@@ -806,6 +849,15 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+
+ AU.addUsedIfAvailable<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addUsedIfAvailable<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addUsedIfAvailable<LiveDebugVariables>();
+ AU.addPreserved<LiveDebugVariables>();
+ AU.addPreserved<LiveStacks>();
+
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -814,8 +866,8 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
private:
bool needVSETVLI(const MachineInstr &MI, const VSETVLIInfo &Require,
const VSETVLIInfo &CurInfo) const;
- bool needVSETVLIPHI(const VSETVLIInfo &Require,
- const MachineBasicBlock &MBB) const;
+ bool needVSETVLIPHI(const VSETVLIInfo &Require, const MachineBasicBlock &MBB,
+ const MachineInstr &MI) const;
void insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo);
void insertVSETVLI(MachineBasicBlock &MBB,
@@ -909,12 +961,13 @@ static unsigned computeVLMAX(unsigned VLEN, unsigned SEW,
static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
const RISCVSubtarget &ST,
- const MachineRegisterInfo *MRI) {
+ const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
VSETVLIInfo InstrInfo;
bool TailAgnostic = true;
bool MaskAgnostic = true;
- if (!hasUndefinedMergeOp(MI, *MRI)) {
+ if (!hasUndefinedMergeOp(MI, *MRI, LIS)) {
// Start with undisturbed.
TailAgnostic = false;
MaskAgnostic = false;
@@ -1002,6 +1055,62 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
insertVSETVLI(MBB, MachineBasicBlock::iterator(&MI), DL, Info, PrevInfo);
}
+static void fixupModifyVRegLI(Register VReg, LiveIntervals *LIS) {
+ if (!LIS)
+ return;
+
+ if (LIS->hasInterval(VReg))
+ LIS->removeInterval(VReg);
+ LIS->createAndComputeVirtRegInterval(VReg);
+
+ // After the needVSETVLIPHI optimization, the recomputed interval may be
+ // left with multiple connected components, which would otherwise trigger a
+ // "Multiple connected components in live interval" verifier error.
+ auto &LI = LIS->getInterval(VReg);
+ SmallVector<LiveInterval *, 8> SplitLIs;
+ LIS->splitSeparateComponents(LI, SplitLIs);
+}
+
+static void getVRegFromMI(MachineInstr *MI, SmallVector<Register> &VRegs) {
+ for (auto &MO : MI->operands()) {
+ if (!MO.isReg() || MO.getReg() == 0 || !MO.getReg().isVirtual())
+ continue;
+ Register Reg = MO.getReg();
+ VRegs.push_back(Reg);
+ }
+}
+
+static void fixupModifyVRegLIAfterInsertMI(MachineInstr *MI,
+ LiveIntervals *LIS) {
+
+ if (!LIS)
+ return;
+
+ if (LIS->isNotInMIMap(*MI))
+ LIS->InsertMachineInstrInMaps(*MI);
+
+ SmallVector<Register> NeedFixupVReg;
+ getVRegFromMI(MI, NeedFixupVReg);
+
+ for (auto VReg : NeedFixupVReg)
+ fixupModifyVRegLI(VReg, LIS);
+}
+
+static void removeMIAndFixupModifyVRegLI(MachineInstr *MI, LiveIntervals *LIS) {
+
+ SmallVector<Register> NeedFixupVReg;
+ getVRegFromMI(MI, NeedFixupVReg);
+
+ // Detach MI from the slot index maps while it is still alive, then erase it.
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ if (!LIS)
+ return;
+ for (auto VReg : NeedFixupVReg)
+ fixupModifyVRegLI(VReg, LIS);
+}
+
void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt, DebugLoc DL,
const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) {
@@ -1011,11 +1120,13 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
// VLMAX.
if (Info.hasSameAVL(PrevInfo) && Info.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE())
- .addReg(RISCV::VL, RegState::Implicit);
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1039,10 +1150,12 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
}
if (Info.hasAVLImm()) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addImm(Info.getAVLImm())
- .addImm(Info.encodeVTYPE());
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addImm(Info.getAVLImm())
+ .addImm(Info.encodeVTYPE());
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1051,18 +1164,22 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// the previous vl to become invalid.
if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
Info.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE())
- .addReg(RISCV::VL, RegState::Implicit);
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
// Otherwise use an AVL of 1 to avoid depending on previous vl.
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addImm(1)
- .addImm(Info.encodeVTYPE());
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addImm(1)
+ .addImm(Info.encodeVTYPE());
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1077,10 +1194,11 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
Register AVLReg = Info.getAVLReg();
MRI->constrainRegClass(AVLReg, &RISCV::GPRNoX0RegClass);
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLI))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(AVLReg)
- .addImm(Info.encodeVTYPE());
+ auto NeedFixupMI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLI))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(AVLReg)
+ .addImm(Info.encodeVTYPE());
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
}
static bool isLMUL1OrSmaller(RISCVII::VLMUL LMUL) {
@@ -1093,12 +1211,13 @@ static bool isLMUL1OrSmaller(RISCVII::VLMUL LMUL) {
bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
const VSETVLIInfo &Require,
const VSETVLIInfo &CurInfo) const {
- assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, MRI));
+ assert(Require ==
+ computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, MRI, LIS));
if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly())
return true;
- DemandedFields Used = getDemanded(MI, MRI, ST);
+ DemandedFields Used = getDemanded(MI, MRI, ST, LIS);
// A slidedown/slideup with an *undefined* merge op can freely clobber
// elements not copied from the source vector (e.g. masked off, tail, or
@@ -1109,7 +1228,8 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// * The LMUL1 restriction is for machines whose latency may depend on VL.
// * As above, this is only legal for tail "undefined" not "agnostic".
if (isVSlideInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
- isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+ isLMUL1OrSmaller(CurInfo.getVLMUL()) &&
+ hasUndefinedMergeOp(MI, *MRI, LIS)) {
Used.VLAny = false;
Used.VLZeroness = true;
Used.LMUL = false;
@@ -1121,8 +1241,9 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// immediate form of vmv.s.x, and thus frequently use vmv.v.i in it's place.
// Since a splat is non-constant time in LMUL, we do need to be careful to not
// increase the number of active vector registers (unlike for vmv.s.x.)
- if (isScalarSplatInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
- isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+ if (isScalarSplatInstr(MI) && Require.hasAVLImm() &&
+ Require.getAVLImm() == 1 && isLMUL1OrSmaller(CurInfo.getVLMUL()) &&
+ hasUndefinedMergeOp(MI, *MRI, LIS)) {
Used.LMUL = false;
Used.SEWLMULRatio = false;
Used.VLAny = false;
@@ -1133,7 +1254,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
Used.TailPolicy = false;
}
- if (CurInfo.isCompatible(Used, Require, *MRI))
+ if (CurInfo.isCompatible(Used, Require, *MRI, LIS))
return false;
// We didn't find a compatible value. If our AVL is a virtual register,
@@ -1156,7 +1277,9 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// maintain the SEW/LMUL ratio. This allows us to eliminate VL toggles in more
// places.
static VSETVLIInfo adjustIncoming(VSETVLIInfo PrevInfo, VSETVLIInfo NewInfo,
- DemandedFields &Demanded) {
+ DemandedFields &Demanded,
+ const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
VSETVLIInfo Info = NewInfo;
if (!Demanded.LMUL && !Demanded.SEWLMULRatio && PrevInfo.isValid() &&
@@ -1179,7 +1302,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
if (!RISCVII::hasSEWOp(TSFlags))
return;
- const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, MRI);
+ const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, MRI, LIS);
assert(NewInfo.isValid() && !NewInfo.isUnknown());
if (Info.isValid() && !needVSETVLI(MI, NewInfo, Info))
return;
@@ -1188,8 +1311,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
if (!Info.isValid() || Info.isUnknown())
Info = NewInfo;
- DemandedFields Demanded = getDemanded(MI, MRI, ST);
- const VSETVLIInfo IncomingInfo = adjustIncoming(PrevInfo, NewInfo, Demanded);
+ DemandedFields Demanded = getDemanded(MI, MRI, ST, LIS);
+ const VSETVLIInfo IncomingInfo =
+ adjustIncoming(PrevInfo, NewInfo, Demanded, MRI, LIS);
// If MI only demands that VL has the same zeroness, we only need to set the
// AVL if the zeroness differs. This removes a vsetvli entirely if the types
@@ -1323,46 +1447,81 @@ void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) {
// be unneeded if the AVL is a phi node where all incoming values are VL
// outputs from the last VSETVLI in their respective basic blocks.
bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
- const MachineBasicBlock &MBB) const {
+ const MachineBasicBlock &MBB,
+ const MachineInstr &MI) const {
if (DisableInsertVSETVLPHIOpt)
return true;
if (!Require.hasAVLReg())
return true;
- // We need the AVL to be produce by a PHI node in this basic block.
- const MachineInstr *PHI = &Require.getAVLDefMI();
- if (PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
- return true;
-
- for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
- PHIOp += 2) {
- Register InReg = PHI->getOperand(PHIOp).getReg();
- MachineBasicBlock *PBB = PHI->getOperand(PHIOp + 1).getMBB();
- const VSETVLIInfo &PBBExit = BlockInfo[PBB->getNumber()].Exit;
+ if (!MRI->isSSA()) {
- // We need the PHI input to the be the output of a VSET(I)VLI.
- MachineInstr *DefMI = MRI->getVRegDef(InReg);
- if (!DefMI || !isVectorConfigInstr(*DefMI))
+ // Without LiveIntervals (e.g. at O0), conservatively assume we need it.
+ if (!LIS)
return true;
- // We found a VSET(I)VLI make sure it matches the output of the
- // predecessor block.
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI);
- if (DefInfo != PBBExit)
+ LiveRange &LR = LIS->getInterval(Require.getAVLReg());
+ SlotIndexes *SIs = LIS->getSlotIndexes();
+ SlotIndex SI = SIs->getInstructionIndex(MI);
+ VNInfo *Valno = LR.getVNInfoAt(SI);
+ if (!Valno || !Valno->isPHIDef())
return true;
- // Require has the same VL as PBBExit, so if the exit from the
- // predecessor has the VTYPE we are looking for we might be able
- // to avoid a VSETVLI.
- if (PBBExit.isUnknown() || !PBBExit.hasSameVTYPE(Require))
+ for (auto *PredMBB : MBB.predecessors()) {
+ const BlockData &PBBInfo = BlockInfo[PredMBB->getNumber()];
+ if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
+ return true;
+
+ const VNInfo *Value = LR.getVNInfoBefore(LIS->getMBBEndIdx(PredMBB));
+ if (!Value)
+ return true;
+
+ // TODO: DefMI is a COPY in most cases; perhaps we should keep searching
+ // until we encounter a non-COPY instruction.
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(Value->def);
+ if (!DefMI || !isVectorConfigInstr(*DefMI))
+ return true;
+
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI);
+ if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
+ !DefInfo.hasSameVTYPE(PBBInfo.Exit))
+ return true;
+ }
+ } else {
+ // We need the AVL to be produced by a PHI node in this basic block.
+ const MachineInstr *PHI = &Require.getAVLDefMI();
+ if (PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
return true;
- }
+
+ for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
+ PHIOp += 2) {
+ Register InReg = PHI->getOperand(PHIOp).getReg();
+ MachineBasicBlock *PBB = PHI->getOperand(PHIOp + 1).getMBB();
+ const VSETVLIInfo &PBBExit = BlockInfo[PBB->getNumber()].Exit;
+
+ // We need the PHI input to be the output of a VSET(I)VLI.
+ MachineInstr *DefMI = MRI->getVRegDef(InReg);
+ if (!DefMI || !isVectorConfigInstr(*DefMI))
+ return true;
+
+ // We found a VSET(I)VLI make sure it matches the output of the
+ // predecessor block.
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI);
+ if (DefInfo != PBBExit)
+ return true;
+
+ // Require has the same VL as PBBExit, so if the exit from the
+ // predecessor has the VTYPE we are looking for we might be able
+ // to avoid a VSETVLI.
+ if (PBBExit.isUnknown() || !PBBExit.hasSameVTYPE(Require))
+ return true;
+ }
// If all the incoming values to the PHI checked out, we don't need
// to insert a VSETVLI.
return false;
-}
+ }
void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
VSETVLIInfo CurInfo = BlockInfo[MBB.getNumber()].Pred;
@@ -1394,7 +1553,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
// wouldn't be used and VL/VTYPE registers are correct. Note that
// we *do* need to model the state as if it changed as while the
// register contents are unchanged, the abstract model can change.
- if (!PrefixTransparent || needVSETVLIPHI(CurInfo, MBB))
+ if (!PrefixTransparent || needVSETVLIPHI(CurInfo, MBB, MI))
insertVSETVLI(MBB, MI, CurInfo, PrevInfo);
PrefixTransparent = false;
}
@@ -1403,9 +1562,11 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
if (VLOp.isReg()) {
Register Reg = VLOp.getReg();
- MachineInstr *VLOpDef = MRI->getVRegDef(Reg);
+ MachineInstr *VLOpDef = getReachingDefMI(Reg, &MI, MRI, LIS);
// Erase the AVL operand from the instruction.
+ Register VLOpReg = VLOp.getReg();
+ bool IsVirtVLOpReg = VLOp.getReg().isVirtual();
VLOp.setReg(RISCV::NoRegister);
VLOp.setIsKill(false);
@@ -1415,7 +1576,9 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
// dead now.
if (VLOpDef && TII->isAddImmediate(*VLOpDef, Reg) &&
MRI->use_nodbg_empty(Reg))
- VLOpDef->eraseFromParent();
+ removeMIAndFixupModifyVRegLI(VLOpDef, LIS);
+ if (IsVirtVLOpReg)
+ fixupModifyVRegLI(VLOpReg, LIS);
}
MI.addOperand(MachineOperand::CreateReg(RISCV::VL, /*isDef*/ false,
/*isImp*/ true));
@@ -1567,10 +1730,10 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
// Return true if we can mutate PrevMI to match MI without changing any the
// fields which would be observed.
-static bool canMutatePriorConfig(const MachineInstr &PrevMI,
- const MachineInstr &MI,
- const DemandedFields &Used,
- const MachineRegisterInfo &MRI) {
+static bool canMutatePriorConfig(
+ const MachineInstr &PrevMI, const MachineInstr &MI,
+ const DemandedFields &Used, const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
// If the VL values aren't equal, return false if either a) the former is
// demanded, or b) we can't rewrite the former to be the later for
// implementation reasons.
@@ -1611,6 +1774,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
Used.demandVL();
Used.demandVTYPE();
SmallVector<MachineInstr*> ToDelete;
+ SmallVector<MachineInstr *> MIInBetween;
for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
if (!isVectorConfigInstr(MI)) {
@@ -1628,13 +1792,31 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
Used.demandVL();
if (NextMI) {
+
+ // A tail-undefined vmv.v.i/x or vfmv.v.f with VL=1 can be treated as
+ // semantically equivalent to vmv.s.x.
+ if (MIInBetween.size() == 1 && isScalarSplatInstr(*MIInBetween[0]) &&
+ MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 1 &&
+ isLMUL1OrSmaller(RISCVVType::getVLMUL(MI.getOperand(2).getImm())) &&
+ hasUndefinedMergeOp(*MIInBetween[0], *MRI, LIS)) {
+ Used.LMUL = false;
+ Used.SEWLMULRatio = false;
+ Used.VLAny = false;
+ if (isFloatScalarMoveOrScalarSplatInstr(*MIInBetween[0]) &&
+ !ST->hasVInstructionsF64())
+ Used.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
+ else
+ Used.SEW = DemandedFields::SEWGreaterThanOrEqual;
+ Used.TailPolicy = false;
+ }
+
if (!Used.usedVL() && !Used.usedVTYPE()) {
ToDelete.push_back(&MI);
// Leave NextMI unchanged
continue;
}
- if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) {
+ if (canMutatePriorConfig(MI, *NextMI, Used, *MRI, LIS)) {
if (!isVLPreservingConfig(*NextMI)) {
Register DefReg = NextMI->getOperand(0).getReg();
@@ -1689,7 +1871,8 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
}
}
NextMI = &MI;
- Used = getDemanded(MI, MRI, ST);
+ Used = getDemanded(MI, MRI, ST, LIS);
+ MIInBetween.clear();
}
NumCoalescedVSETVL += ToDelete.size();
@@ -1706,11 +1889,16 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
MachineInstr &MI = *I++;
if (RISCV::isFaultFirstLoad(MI)) {
Register VLOutput = MI.getOperand(1).getReg();
- if (!MRI->use_nodbg_empty(VLOutput))
- BuildMI(MBB, I, MI.getDebugLoc(), TII->get(RISCV::PseudoReadVL),
- VLOutput);
+ bool IsVirtual = MI.getOperand(1).getReg().isVirtual();
+ if (!MRI->use_nodbg_empty(VLOutput)) {
+ auto NeedFixupMI = BuildMI(MBB, I, MI.getDebugLoc(),
+ TII->get(RISCV::PseudoReadVL), VLOutput);
+ fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ }
// We don't use the vl output of the VLEFF/VLSEGFF anymore.
MI.getOperand(1).setReg(RISCV::X0);
+ if (IsVirtual)
+ fixupModifyVRegLI(VLOutput, LIS);
}
}
}
@@ -1725,6 +1913,7 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
MRI = &MF.getRegInfo();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
assert(BlockInfo.empty() && "Expect empty block infos");
BlockInfo.resize(MF.getNumBlockIDs());
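For readers less familiar with the LiveIntervals API the new getReachingDefMI helper relies on, the non-SSA lookup above boils down to the standalone sketch below (the function name is illustrative and not part of the patch; it assumes LiveIntervals has already been computed and that Reg is a virtual register):

  // Sketch: map the value number that is live immediately before the use back
  // to the MachineInstr that defined it. PHI-defined values have no single
  // defining instruction, and physical registers are not handled, so both
  // cases return nullptr. (Uses llvm/CodeGen/LiveIntervals.h, SlotIndexes.h.)
  static MachineInstr *findReachingDefSketch(Register Reg, MachineInstr &UseMI,
                                             LiveIntervals &LIS) {
    if (!Reg.isVirtual() || !LIS.hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS.getInterval(Reg);
    SlotIndexes *SIs = LIS.getSlotIndexes();
    SlotIndex UseIdx = SIs->getInstructionIndex(UseMI);
    VNInfo *VNI = LI.getVNInfoBefore(UseIdx);
    if (!VNI || VNI->isPHIDef())
      return nullptr;
    return SIs->getInstructionFromIndex(VNI->def);
  }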
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 0876f46728a10c..a99db1810295b7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -96,6 +96,10 @@ static cl::opt<bool> EnableMISchedLoadClustering(
cl::desc("Enable load clustering in the machine scheduler"),
cl::init(false));
+static cl::opt<bool> EnableVSETVLIAfterRVVRegAlloc(
+ "riscv-vsetvli-after-rvv-regalloc", cl::Hidden,
+ cl::desc("vsetvl insertion after rvv regalloc"), cl::init(false));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -389,6 +393,8 @@ FunctionPass *RISCVPassConfig::createRVVRegAllocPass(bool Optimized) {
bool RISCVPassConfig::addRegAssignAndRewriteFast() {
addPass(createRVVRegAllocPass(false));
+ if (EnableVSETVLIAfterRVVRegAlloc)
+ addPass(createRISCVInsertVSETVLIPass());
addPass(createRISCVCoalesceVSETVLIPass());
return TargetPassConfig::addRegAssignAndRewriteFast();
}
@@ -396,6 +402,8 @@ bool RISCVPassConfig::addRegAssignAndRewriteFast() {
bool RISCVPassConfig::addRegAssignAndRewriteOptimized() {
addPass(createRVVRegAllocPass(true));
addPass(createVirtRegRewriter(false));
+ if (EnableVSETVLIAfterRVVRegAlloc)
+ addPass(createRISCVInsertVSETVLIPass());
addPass(createRISCVCoalesceVSETVLIPass());
return TargetPassConfig::addRegAssignAndRewriteOptimized();
}
@@ -535,7 +543,8 @@ void RISCVPassConfig::addPreRegAlloc() {
addPass(createRISCVPreRAExpandPseudoPass());
if (TM->getOptLevel() != CodeGenOptLevel::None)
addPass(createRISCVMergeBaseOffsetOptPass());
- addPass(createRISCVInsertVSETVLIPass());
+ if (!EnableSplitRegAlloc || !EnableVSETVLIAfterRVVRegAlloc)
+ addPass(createRISCVInsertVSETVLIPass());
if (TM->getOptLevel() != CodeGenOptLevel::None &&
EnableRISCVDeadRegisterElimination)
addPass(createRISCVDeadRegisterDefinitionsPass());
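One practical note on the two knobs above: judging from the addPreRegAlloc change, the pre-RA insertion is skipped only when both --riscv-split-regalloc=1 and -riscv-vsetvli-after-rvv-regalloc=1 are passed to llc, which is exactly the combination the updated RUN line in the test below switches to; with either option left at its default, RISCVInsertVSETVLI keeps running at its current pre-RA position.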
diff --git a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
index 8204cec7e27794..65270ea3a40eee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b -target-abi=lp64d \
-; RUN: --riscv-split-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
+; RUN: --riscv-split-regalloc=1 -riscv-vsetvli-after-rvv-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
define <vscale x 2 x i1> @fcmp_ole_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb) nounwind strictfp {
; CHECK-LABEL: fcmp_ole_vv_nxv2f16:
@@ -32,41 +32,39 @@ define dso_local void @test_interleave_cause_spill(ptr nocapture noundef %in) lo
; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: addi a1, a0, 12
; CHECK-NEXT: vle32.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 16
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: addi a1, a0, 20
; CHECK-NEXT: vle32.v v4, (a1)
+; CHECK-NEXT: addi a1, a0, 20
+; CHECK-NEXT: vle32.v v0, (a1)
; CHECK-NEXT: addi a1, a0, 24
; CHECK-NEXT: vle32.v v16, (a1)
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v24, v8, v24
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vadd.vv v20, v0, v4
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vv v8, v8, v24
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vadd.vv v16, v0, v16
+; CHECK-NEXT: vadd.vv v20, v4, v0
+; CHECK-NEXT: vadd.vv v16, v4, v16
; CHECK-NEXT: addi a1, a0, 40
; CHECK-NEXT: vse32.v v20, (a1)
; CHECK-NEXT: addi a1, a0, 44
; CHECK-NEXT: vse32.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 48
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vse32.v v16, (a1)
; CHECK-NEXT: addi a0, a0, 52
; CHECK-NEXT: vse32.v v8, (a0)
@@ -321,18 +319,19 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
; CHECK-LABEL: constant_folding_crash:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld a0, 8(a0)
-; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: seqz a0, a0
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v13, a0
-; CHECK-NEXT: vmsne.vi v0, v13, 0
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vmsne.vi v12, v12, 0
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v13
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vrgather.vi v9, v8, 0
; CHECK-NEXT: vmsne.vi v0, v9, 0
@@ -411,13 +410,13 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-NEXT: sltu a0, a0, a1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: and a0, a0, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
; CHECK-NEXT: vand.vx v16, v16, a2, v0.t
; CHECK-NEXT: vsub.vv v16, v8, v16, v0.t
@@ -532,13 +531,14 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 5
; CHECK-NEXT: vmv.s.x v10, a2
+; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vle32.v v12, (a0)
; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 6
; CHECK-NEXT: vmv.s.x v10, a3
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
-; CHECK-NEXT: vle32.v v12, (a0)
; CHECK-NEXT: vslideup.vi v8, v10, 7
; CHECK-NEXT: vadd.vv v8, v8, v12
; CHECK-NEXT: ret
@@ -569,14 +569,14 @@ define <8 x i1> @fp2si_v8f64_v8i1(<8 x double> %x) {
define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i32_v2i32_2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a1)
; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vslideup.vi v8, v10, 2
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v10, (a0)
+; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
@@ -592,13 +592,14 @@ define void @buildvec_seq_v9i8(ptr %x) {
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 3
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vmv.v.i v9, 3
; CHECK-NEXT: li a1, 146
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmerge.vim v8, v8, 2, v0
+; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v9, 2, v0
; CHECK-NEXT: vsetivli zero, 9, e8, m1, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
@@ -626,373 +627,422 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 52
+; CHECK-NEXT: li a3, 74
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x34, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 52 * vlenb
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xca, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 74 * vlenb
; CHECK-NEXT: addi a2, a1, 256
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 27
+; CHECK-NEXT: li a3, 25
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: addi a2, a1, 128
-; CHECK-NEXT: vle64.v v8, (a2)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 35
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a3, a1, 6
+; CHECK-NEXT: add a1, a3, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v8, v16, 4
+; CHECK-NEXT: vrgather.vi v12, v16, 4
; CHECK-NEXT: li a1, 128
-; CHECK-NEXT: vmv.s.x v4, a1
+; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v16, 8
+; CHECK-NEXT: vslidedown.vi v16, v16, 8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: li a3, 49
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v4
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
-; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs1r.v v4, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vrgather.vi v8, v24, 2, v0.t
-; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: vrgather.vi v12, v16, 2, v0.t
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vmul.vx v2, v8, a1
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vmul.vx v2, v10, a1
+; CHECK-NEXT: li a1, 56
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 57
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv.s.x v7, a1
+; CHECK-NEXT: vadd.vi v10, v2, -16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v2
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: li a1, 56
-; CHECK-NEXT: vmv.s.x v1, a1
-; CHECK-NEXT: vadd.vi v16, v2, -16
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v2
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v10, v0.t
; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v20, v8
+; CHECK-NEXT: vmv.v.v v12, v16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 4
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: li a2, 21
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 25
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v8, v16, 5
-; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgather.vi v12, v16, 5
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmv1r.v v6, v8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: li a2, 49
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v8, v16, 3, v0.t
-; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vrgather.vi v12, v16, 3, v0.t
+; CHECK-NEXT: vmv.v.v v28, v12
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs2r.v v2, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vadd.vi v24, v2, 1
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v26, v2, -15
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v24, v2, -15
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs2r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v26, v0.t
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v28, v16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: lui a1, 16
+; CHECK-NEXT: addi a1, a1, 7
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 6
+; CHECK-NEXT: vmv.v.x v10, a1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 25
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v16, v9
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: li a2, 45
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v2, v0.t
-; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgatherei16.vv v12, v16, v10
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 11
+; CHECK-NEXT: li a2, 41
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl2r.v v2, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vadd.vi v4, v2, 2
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v4
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: vmv.s.x v4, a1
-; CHECK-NEXT: vadd.vi v16, v2, -14
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v4
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v12, 6
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: vrgather.vi v12, v16, 2
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 37
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vmv4r.v v24, v16
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vi v12, v16, 3
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: vmv.s.x v1, a1
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v24, v2, 2
+; CHECK-NEXT: vadd.vi v4, v2, -14
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: slli a2, a1, 6
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v24
; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v16, v24, 4, v0.t
-; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v16, v8
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v4, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 3
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: li a2, 25
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v28, v2, 3
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: li a2, 49
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v28
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v16, v2, -13
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v4
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 45
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
-; CHECK-NEXT: lui a1, 16
-; CHECK-NEXT: addi a1, a1, 7
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgather.vi v20, v16, 4, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 45
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v4, v2, 3
+; CHECK-NEXT: vadd.vi v8, v2, -13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v24, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v8, v16, v4
; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
-; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v24, v16, 5, v0.t
-; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v24, v8
+; CHECK-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: slli a2, a1, 3
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v6
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 49
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 41
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgather.vi v8, v24, 5, v0.t
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 41
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: lui a1, 96
+; CHECK-NEXT: li a2, 192
+; CHECK-NEXT: vmv.s.x v28, a2
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v8, a1
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; CHECK-NEXT: li a1, 192
-; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vmv1r.v v0, v28
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: li a2, 37
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v12, v24, v8, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
+; CHECK-NEXT: li a2, 37
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v4, v24, 2
-; CHECK-NEXT: vrgatherei16.vv v4, v16, v8, v0.t
+; CHECK-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: li a1, 28
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v26, v2, 4
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v30, v2, 4
+; CHECK-NEXT: vadd.vi v6, v2, -12
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v26
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: li a1, 28
-; CHECK-NEXT: vmv.s.x v1, a1
-; CHECK-NEXT: vadd.vi v16, v2, -12
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v30
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
-; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v4, v8
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v6, v0.t
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: lui a1, 112
; CHECK-NEXT: addi a1, a1, 1
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vmv1r.v v0, v28
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 27
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vi v8, v16, 3
+; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: li a2, 45
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 19
+; CHECK-NEXT: li a2, 25
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; CHECK-NEXT: vmv.v.v v16, v24
+; CHECK-NEXT: vmv2r.v v8, v2
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vadd.vi v12, v2, 5
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 6
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v24, v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vadd.vi v2, v8, -11
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 43
+; CHECK-NEXT: li a2, 57
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vadd.vi v12, v2, -11
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vrgatherei16.vv v24, v8, v2, v0.t
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 35
+; CHECK-NEXT: li a2, 41
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
+; CHECK-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: vmv.v.v v12, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 37
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv.v.v v20, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: addi a1, a0, 320
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vse64.v v4, (a1)
+; CHECK-NEXT: vse64.v v20, (a1)
; CHECK-NEXT: addi a1, a0, 192
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a3, a2, 1
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: vse64.v v12, (a1)
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vse64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 64
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a3, a2, 3
-; CHECK-NEXT: sub a2, a3, a2
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vse64.v v8, (a1)
-; CHECK-NEXT: addi a1, a0, 64
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 11
-; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 4
+; CHECK-NEXT: add a2, a3, a2
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 4
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: li a2, 21
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 52
+; CHECK-NEXT: li a1, 74
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
@@ -1020,110 +1070,114 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 56
-; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: slli a2, a2, 6
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
-; CHECK-NEXT: vmv1r.v v4, v0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 5
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v1, v0, 8
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v2, v0, 4
-; CHECK-NEXT: addi a2, a1, 512
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: li a3, 40
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v29, v0, 8
+; CHECK-NEXT: addi a2, a1, 512
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v27, v1, 4
-; CHECK-NEXT: addi a2, a1, 640
+; CHECK-NEXT: vslidedown.vi v27, v29, 4
+; CHECK-NEXT: addi a3, a1, 640
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: vle64.v v8, (a3)
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: addi a2, a7, -64
-; CHECK-NEXT: sltu a3, a7, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a4, a3, a2
-; CHECK-NEXT: addi a2, a4, -32
-; CHECK-NEXT: sltu a3, a4, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a3, a3, a2
-; CHECK-NEXT: addi a2, a3, -16
-; CHECK-NEXT: sltu a5, a3, a2
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a2, a5, a2
; CHECK-NEXT: vslidedown.vi v0, v27, 2
-; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: addi a3, a7, -64
+; CHECK-NEXT: sltu a4, a7, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a4, a4, a3
+; CHECK-NEXT: addi a3, a4, -32
+; CHECK-NEXT: sltu a5, a4, a3
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a3, a5, a3
+; CHECK-NEXT: addi a5, a3, -16
+; CHECK-NEXT: sltu a6, a3, a5
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: and a5, a6, a5
+; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma
+; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 24
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: addi a5, a1, 128
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v26, v7, 4
; CHECK-NEXT: bltu a3, a2, .LBB14_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: .LBB14_2:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v12, v2, 2
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a5)
-; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT: li a3, 64
-; CHECK-NEXT: vmv1r.v v0, v27
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: li a6, 40
-; CHECK-NEXT: mul a5, a5, a6
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
+; CHECK-NEXT: vle64.v v8, (a5)
; CHECK-NEXT: csrr a5, vlenb
; CHECK-NEXT: li a6, 48
; CHECK-NEXT: mul a5, a5, a6
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a7, a3, .LBB14_4
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v28, v26, 2
+; CHECK-NEXT: li a5, 64
+; CHECK-NEXT: vmv1r.v v0, v27
+; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a6, 56
+; CHECK-NEXT: mul a3, a3, a6
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a6, a7
+; CHECK-NEXT: bltu a7, a5, .LBB14_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: li a7, 64
+; CHECK-NEXT: li a6, 64
; CHECK-NEXT: .LBB14_4:
; CHECK-NEXT: addi a5, a1, 384
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
-; CHECK-NEXT: addi a6, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a6, a7, -32
-; CHECK-NEXT: sltu t0, a7, a6
-; CHECK-NEXT: addi t0, t0, -1
-; CHECK-NEXT: and a6, t0, a6
+; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: add t0, sp, t0
+; CHECK-NEXT: addi t0, t0, 16
+; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi t0, a6, -32
+; CHECK-NEXT: sltu a6, a6, t0
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: and a6, a6, t0
; CHECK-NEXT: addi t0, a6, -16
; CHECK-NEXT: sltu t1, a6, t0
; CHECK-NEXT: addi t1, t1, -1
; CHECK-NEXT: and t0, t1, t0
+; CHECK-NEXT: vmv1r.v v0, v28
+; CHECK-NEXT: csrr t1, vlenb
+; CHECK-NEXT: li t2, 48
+; CHECK-NEXT: mul t1, t1, t2
+; CHECK-NEXT: add t1, sp, t1
+; CHECK-NEXT: addi t1, t1, 16
+; CHECK-NEXT: vl8r.v v16, (t1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, t0, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
; CHECK-NEXT: csrr t0, vlenb
-; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: slli t0, t0, 4
; CHECK-NEXT: add t0, sp, t0
; CHECK-NEXT: addi t0, t0, 16
; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
@@ -1131,131 +1185,143 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: li a6, 16
; CHECK-NEXT: .LBB14_6:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v3, v1, 2
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a5)
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a1, 256
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v27, v29, 2
+; CHECK-NEXT: vmv1r.v v0, v26
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 3
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v2
-; CHECK-NEXT: addi a5, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t
+; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: li a6, 40
+; CHECK-NEXT: li a6, 48
; CHECK-NEXT: mul a5, a5, a6
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a5, a4
; CHECK-NEXT: bltu a4, a3, .LBB14_8
; CHECK-NEXT: # %bb.7:
-; CHECK-NEXT: li a4, 32
+; CHECK-NEXT: li a5, 32
; CHECK-NEXT: .LBB14_8:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: addi a1, a4, -16
-; CHECK-NEXT: sltu a5, a4, a1
+; CHECK-NEXT: addi a1, a5, -16
+; CHECK-NEXT: sltu a5, a5, a1
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a1, a5, a1
+; CHECK-NEXT: vmv1r.v v0, v27
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v3
; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
; CHECK-NEXT: bltu a4, a2, .LBB14_10
; CHECK-NEXT: # %bb.9:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: .LBB14_10:
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v5, v4, 2
+; CHECK-NEXT: vslidedown.vi v6, v7, 2
+; CHECK-NEXT: vmv1r.v v0, v29
; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t
; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: mv a1, a7
; CHECK-NEXT: bltu a7, a3, .LBB14_12
; CHECK-NEXT: # %bb.11:
-; CHECK-NEXT: li a7, 32
+; CHECK-NEXT: li a1, 32
; CHECK-NEXT: .LBB14_12:
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 56
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 24
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 48
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 48
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 40
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 56
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 48
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 40
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 48
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; CHECK-NEXT: vmv4r.v v8, v0
; CHECK-NEXT: vslideup.vi v8, v24, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a7, -16
-; CHECK-NEXT: sltu a4, a7, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: li a5, 24
+; CHECK-NEXT: mul a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a4, a1, -16
+; CHECK-NEXT: sltu a1, a1, a4
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a1, a1, a4
+; CHECK-NEXT: vmv1r.v v0, v6
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v5
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a1, a1, a4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
; CHECK-NEXT: bltu a7, a2, .LBB14_14
; CHECK-NEXT: # %bb.13:
; CHECK-NEXT: li a7, 16
; CHECK-NEXT: .LBB14_14:
-; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: li a2, 40
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vi v24, v16, 16
; CHECK-NEXT: vse32.v v24, (a0)
; CHECK-NEXT: addi a1, a0, 256
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: li a3, 48
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
@@ -1263,15 +1329,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: addi a0, a0, 384
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 48
+; CHECK-NEXT: li a2, 56
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 56
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 6
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -1382,12 +1447,12 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
@@ -1404,25 +1469,57 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: slli a2, a2, 5
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v16, (a1)
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsoxei64.v v8, (zero), v24, v0.t
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vl8re64.v v8, (a1)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a0
+; CHECK-NEXT: vslidedown.vx v24, v0, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei64.v v16, (zero), v8, v0.t
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsoxei64.v v8, (zero), v16, v0.t
+; CHECK-NEXT: vsoxei64.v v16, (zero), v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -1446,23 +1543,22 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: vl8re64.v v8, (a1)
-; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vadd.vv v16, v8, v8
-; CHECK-NEXT: vrgather.vv v8, v0, v16
+; CHECK-NEXT: mul a0, a0, a2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v0, (a1)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vadd.vv v16, v8, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
@@ -1470,34 +1566,47 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vrgather.vv v24, v8, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vv v8, v0, v16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vadd.vi v8, v16, 1
-; CHECK-NEXT: vrgather.vv v16, v0, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vv v16, v0, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vrgather.vv v24, v0, v8
-; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv4r.v v28, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmv4r.v v28, v8
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v20, v8
; CHECK-NEXT: vmv8r.v v8, v24
@@ -1527,7 +1636,15 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a2, a1, 2
; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
@@ -1537,16 +1654,22 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v28, v16, v0.t
+; CHECK-NEXT: vfncvt.rtz.x.f.w v20, v24, v0.t
; CHECK-NEXT: bltu a0, a1, .LBB22_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vfncvt.rtz.x.f.w v24, v8, v0.t
-; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x i16> %v
@@ -1557,26 +1680,27 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsext.vf4 v16, v8
-; CHECK-NEXT: vsll.vi v24, v16, 3
; CHECK-NEXT: vsext.vf4 v16, v10
; CHECK-NEXT: vsll.vi v16, v16, 3
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: srli a4, a2, 3
; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vluxei64.v v16, (a0), v16, v0.t
+; CHECK-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsext.vf4 v24, v8
+; CHECK-NEXT: vsll.vi v24, v24, 3
; CHECK-NEXT: bltu a1, a2, .LBB23_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vluxei64.v v8, (a0), v24, v0.t
; CHECK-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i16> %idxs
@@ -1591,11 +1715,17 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1604,35 +1734,51 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
; CHECK-NEXT: slli a1, a3, 3
; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vl8re32.v v8, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: slli a1, a3, 1
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: sltu a5, a2, a4
; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a4, a5, a4
; CHECK-NEXT: srli a3, a3, 2
-; CHECK-NEXT: vl8re32.v v0, (a0)
+; CHECK-NEXT: vl8re32.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a3
+; CHECK-NEXT: vslidedown.vx v0, v0, a3
+; CHECK-NEXT: and a4, a5, a4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
; CHECK-NEXT: bltu a2, a1, .LBB24_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB24_2:
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -1645,9 +1791,10 @@ define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, <v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vadd.vv v12, v12, v12
-; CHECK-NEXT: vs4r.v v12, (a0)
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: vs4r.v v12, (a0)
+; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: ret
%index = add <vscale x 4 x i64> %x, %x
store <vscale x 4 x i64> %index, <vscale x 4 x i64>* %y
@@ -1659,7 +1806,15 @@ define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, <v
define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vsitofp_nxv32f16_nxv32i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a2, a1, 2
; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
@@ -1669,16 +1824,22 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfncvt.f.x.w v28, v16, v0.t
+; CHECK-NEXT: vfncvt.f.x.w v20, v24, v0.t
; CHECK-NEXT: bltu a0, a1, .LBB26_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB26_2:
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vfncvt.f.x.w v24, v8, v0.t
-; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vfncvt.f.x.w v16, v8, v0.t
+; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
@@ -1693,9 +1854,9 @@ define <4 x float> @tail_vmv_v_i_treat_as_vmv_s_x(<8 x float> %x, <8 x float> %y
; CHECK-NEXT: vmul.vx v14, v12, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vadd.vi v8, v14, -14
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vadd.vi v8, v14, -14
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v12
>From 3d53aed46869eb7a5ee554ad6a18a884dc5aadae Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 11 Mar 2024 06:58:25 -0700
Subject: [PATCH 03/19] Also fix LIS index after insert
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 523301022afef9..1b04d96ae5ea9f 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1089,6 +1089,7 @@ static void fixupModifyVRegLIAfterInsertMI(MachineInstr *MI,
if (LIS->isNotInMIMap(*MI))
LIS->InsertMachineInstrInMaps(*MI);
+ LIS->handleMove(*MI);
SmallVector<Register> NeedFixupVReg;
getVRegFromMI(MI, NeedFixupVReg);
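(Aside for readers following the series: the hunk above makes sure a vsetvli materialized after register allocation gets a SlotIndex, and that the live intervals of the virtual registers it touches are refreshed. A minimal sketch of the usual recompute pattern follows; recomputeVRegInterval is a hypothetical name, and the patch's own fixupModifyVRegLI is assumed to do something equivalent.)

// Illustrative only: refresh the live interval of one virtual register
// after the pass has inserted or erased an instruction that touches it.
static void recomputeVRegInterval(Register VReg, LiveIntervals *LIS) {
  if (!VReg.isVirtual())
    return;                                    // only virtual registers carry a LiveInterval here
  if (LIS->hasInterval(VReg))
    LIS->removeInterval(VReg);                 // drop the stale interval
  LIS->createAndComputeVirtRegInterval(VReg);  // rebuild it from the current MIR
}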
>From 55623b0f0936b436c98304d6b7ff8af7601d5443 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Tue, 9 Apr 2024 23:17:57 -0700
Subject: [PATCH 04/19] Merge getVRegDef and getUniqueVRegDef inside
getReachingDefMI
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 1b04d96ae5ea9f..815ef527a81072 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -57,10 +57,7 @@ namespace {
template <typename T>
static T *getReachingDefMI(Register Reg, T *MI, const MachineRegisterInfo *MRI,
const LiveIntervals *LIS) {
- if (MRI->isSSA())
- return MRI->getVRegDef(Reg);
-
- if (!MI)
+ if (MRI->isSSA() || !MI)
return MRI->getUniqueVRegDef(Reg);
// For O0 situation
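(Net effect of this patch: in SSA form, and whenever there is no anchoring instruction to search from, the helper simply returns MRI->getUniqueVRegDef(Reg); only the post-RA path keeps walking backwards from MI with the help of LiveIntervals. A hedged call-site sketch follows, in the shape the later patches in this series use; AVLReg and the branch body are illustrative, not part of the patch.)

// Resolve the instruction that defines the AVL operand, pre- or post-RA.
const MachineInstr *DefMI = getReachingDefMI(AVLReg, &MI, MRI, LIS);
if (DefMI && isVectorConfigInstr(*DefMI)) {
  // The AVL comes straight from another vsetvli, so its VL/VTYPE state may
  // be reusable instead of emitting a fresh vsetvli here.
}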
>From 537eb79b33b8b743a6a1c47436ad1a86f97aaf3e Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Tue, 9 Apr 2024 23:43:13 -0700
Subject: [PATCH 05/19] Remove ModifyVReg from removeMIAndFixupModifyVRegLI and
fixupModifyVRegLIAfterInsertMI
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 22 ++++++++++----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 815ef527a81072..a17777d6fd15ba 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1077,8 +1077,7 @@ static void getVRegFromMI(MachineInstr *MI, SmallVector<Register> &VRegs) {
}
}
-static void fixupModifyVRegLIAfterInsertMI(MachineInstr *MI,
- LiveIntervals *LIS) {
+static void fixupLIAfterInsertMI(MachineInstr *MI, LiveIntervals *LIS) {
if (!LIS)
return;
@@ -1094,7 +1093,7 @@ static void fixupModifyVRegLIAfterInsertMI(MachineInstr *MI,
fixupModifyVRegLI(VReg, LIS);
}
-static void removeMIAndFixupModifyVRegLI(MachineInstr *MI, LiveIntervals *LIS) {
+static void removeMIAndFixupLI(MachineInstr *MI, LiveIntervals *LIS) {
SmallVector<Register> NeedFixupVReg;
getVRegFromMI(MI, NeedFixupVReg);
@@ -1124,7 +1123,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE())
.addReg(RISCV::VL, RegState::Implicit);
- fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1136,11 +1135,12 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI);
if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ auto NeedFixupMI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE())
.addReg(RISCV::VL, RegState::Implicit);
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
}
@@ -1153,7 +1153,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addImm(Info.getAVLImm())
.addImm(Info.encodeVTYPE());
- fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1168,7 +1168,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE())
.addReg(RISCV::VL, RegState::Implicit);
- fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
// Otherwise use an AVL of 1 to avoid depending on previous vl.
@@ -1177,7 +1177,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addImm(1)
.addImm(Info.encodeVTYPE());
- fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1196,7 +1196,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addReg(AVLReg)
.addImm(Info.encodeVTYPE());
- fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
}
static bool isLMUL1OrSmaller(RISCVII::VLMUL LMUL) {
@@ -1574,7 +1574,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
// dead now.
if (VLOpDef && TII->isAddImmediate(*VLOpDef, Reg) &&
MRI->use_nodbg_empty(Reg))
- removeMIAndFixupModifyVRegLI(VLOpDef, LIS);
+ removeMIAndFixupLI(VLOpDef, LIS);
if (IsVirtVLOpReg)
fixupModifyVRegLI(VLOpReg, LIS);
}
@@ -1891,7 +1891,7 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
if (!MRI->use_nodbg_empty(VLOutput)) {
auto NeedFixupMI = BuildMI(MBB, I, MI.getDebugLoc(),
TII->get(RISCV::PseudoReadVL), VLOutput);
- fixupModifyVRegLIAfterInsertMI(NeedFixupMI, LIS);
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
}
// We don't use the vl output of the VLEFF/VLSEGFF anymore.
MI.getOperand(1).setReg(RISCV::X0);
>From 5e6132dec7b4dca648a05ac65a1db81cf2488100 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Fri, 26 Apr 2024 00:09:36 -0700
Subject: [PATCH 06/19] Resolve some rebase conflicts
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 3 ++-
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a17777d6fd15ba..20867313ec7d75 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1520,6 +1520,7 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
// to insert a VSETVLI.
return false;
}
+}
void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
VSETVLIInfo CurInfo = BlockInfo[MBB.getNumber()].Pred;
@@ -1776,7 +1777,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
if (!isVectorConfigInstr(MI)) {
- Used.doUnion(getDemanded(MI, MRI, ST));
+ Used.doUnion(getDemanded(MI, MRI, ST, LIS));
if (MI.isCall() || MI.isInlineAsm() ||
MI.modifiesRegister(RISCV::VL, /*TRI=*/nullptr) ||
MI.modifiesRegister(RISCV::VTYPE, /*TRI=*/nullptr))
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index a99db1810295b7..44d808049c9096 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -543,7 +543,7 @@ void RISCVPassConfig::addPreRegAlloc() {
addPass(createRISCVPreRAExpandPseudoPass());
if (TM->getOptLevel() != CodeGenOptLevel::None)
addPass(createRISCVMergeBaseOffsetOptPass());
- if (!EnableSplitRegAlloc || !EnableVSETVLIAfterRVVRegAlloc)
+ if (!EnableVSETVLIAfterRVVRegAlloc)
addPass(createRISCVInsertVSETVLIPass());
if (TM->getOptLevel() != CodeGenOptLevel::None &&
EnableRISCVDeadRegisterElimination)
>From c2a614e02426f0ca053e19d84684b2986726e344 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Fri, 26 Apr 2024 00:25:04 -0700
Subject: [PATCH 07/19] Fix format
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 20867313ec7d75..e803cd6851e3cb 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1454,7 +1454,6 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
return true;
if (!MRI->isSSA()) {
-
// For O0
if (!LIS)
return true;
@@ -1486,6 +1485,7 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
!DefInfo.hasSameVTYPE(PBBInfo.Exit))
return true;
}
+ return false;
} else {
// We need the AVL to be produce by a PHI node in this basic block.
const MachineInstr *PHI = &Require.getAVLDefMI();
@@ -1516,9 +1516,9 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
return true;
}
- // If all the incoming values to the PHI checked out, we don't need
- // to insert a VSETVLI.
- return false;
+ // If all the incoming values to the PHI checked out, we don't need
+ // to insert a VSETVLI.
+ return false;
}
}
>From 98bf4dee24a5959642f83e19f2459abb628047c1 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Fri, 26 Apr 2024 00:31:01 -0700
Subject: [PATCH 08/19] Extract return from if else
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index e803cd6851e3cb..ce8c7f47ee321c 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1485,7 +1485,6 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
!DefInfo.hasSameVTYPE(PBBInfo.Exit))
return true;
}
- return false;
} else {
// We need the AVL to be produce by a PHI node in this basic block.
const MachineInstr *PHI = &Require.getAVLDefMI();
@@ -1515,11 +1514,10 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
if (PBBExit.isUnknown() || !PBBExit.hasSameVTYPE(Require))
return true;
}
-
- // If all the incoming values to the PHI checked out, we don't need
- // to insert a VSETVLI.
- return false;
}
+ // If all the incoming values to the PHI checked out, we don't need
+ // to insert a VSETVLI.
+ return false;
}
void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
>From 993fe2c18eb7baa12aeb03f621c4dee863368393 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Fri, 26 Apr 2024 00:59:08 -0700
Subject: [PATCH 09/19] Can build but setAVLRegDef may encounter nullptr
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 39 +++++++++++--------
llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll | 2 +-
2 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index ce8c7f47ee321c..8d8eebf4dcdd77 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -926,7 +926,8 @@ INITIALIZE_PASS(RISCVCoalesceVSETVLI, "riscv-coalesce-vsetvli",
// Return a VSETVLIInfo representing the changes made by this VSETVLI or
// VSETIVLI instruction.
static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
VSETVLIInfo NewInfo;
if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
NewInfo.setAVLImm(MI.getOperand(1).getImm());
@@ -939,7 +940,7 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
if (AVLReg == RISCV::X0)
NewInfo.setAVLVLMAX();
else
- NewInfo.setAVLRegDef(MRI.getVRegDef(AVLReg), AVLReg);
+ NewInfo.setAVLRegDef(getReachingDefMI(AVLReg, &MI, &MRI, LIS), AVLReg);
}
NewInfo.setVTYPE(MI.getOperand(2).getImm());
@@ -1012,7 +1013,8 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
else
InstrInfo.setAVLImm(Imm);
} else {
- InstrInfo.setAVLRegDef(MRI->getVRegDef(VLOp.getReg()), VLOp.getReg());
+ InstrInfo.setAVLRegDef(getReachingDefMI(VLOp.getReg(), &MI, MRI, LIS),
+ VLOp.getReg());
}
} else {
assert(isScalarExtractInstr(MI));
@@ -1035,7 +1037,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
if (InstrInfo.hasAVLReg()) {
const MachineInstr &DefMI = InstrInfo.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
- VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(DefMI, *MRI);
+ VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
if (DefInstrInfo.hasSameVLMAX(InstrInfo) &&
(DefInstrInfo.hasAVLImm() || DefInstrInfo.hasAVLVLMAX()))
InstrInfo.setAVL(DefInstrInfo);
@@ -1133,7 +1135,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg()) {
const MachineInstr &DefMI = Info.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
- VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI);
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
auto NeedFixupMI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
@@ -1183,10 +1185,12 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
if (Info.hasAVLVLMAX()) {
Register DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass);
- BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(DestReg, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE());
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE());
+ fixupLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1262,7 +1266,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
if (Require.hasAVLReg() && CurInfo.hasCompatibleVTYPE(Used, Require)) {
const MachineInstr &DefMI = Require.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
- VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI);
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo))
return false;
}
@@ -1351,14 +1355,15 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
const MachineInstr &MI) const {
if (isVectorConfigInstr(MI)) {
- Info = getInfoForVSETVLI(MI, *MRI);
+ Info = getInfoForVSETVLI(MI, *MRI, LIS);
return;
}
if (RISCV::isFaultFirstLoad(MI)) {
// Update AVL to vl-output of the fault first load.
- Info.setAVLRegDef(MRI->getVRegDef(MI.getOperand(1).getReg()),
- MI.getOperand(1).getReg());
+ Info.setAVLRegDef(
+ getReachingDefMI(MI.getOperand(1).getReg(), &MI, MRI, LIS),
+ MI.getOperand(1).getReg());
return;
}
@@ -1480,7 +1485,7 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
if (!DefMI || !isVectorConfigInstr(*DefMI))
return true;
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI);
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI, LIS);
if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
!DefInfo.hasSameVTYPE(PBBInfo.Exit))
return true;
@@ -1504,7 +1509,7 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
// We found a VSET(I)VLI make sure it matches the output of the
// predecessor block.
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI);
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI, LIS);
if (DefInfo != PBBExit)
return true;
@@ -1741,8 +1746,8 @@ static bool canMutatePriorConfig(
if (Used.VLZeroness) {
if (isVLPreservingConfig(PrevMI))
return false;
- if (!getInfoForVSETVLI(PrevMI, MRI)
- .hasEquallyZeroAVL(getInfoForVSETVLI(MI, MRI)))
+ if (!getInfoForVSETVLI(PrevMI, MRI, LIS)
+ .hasEquallyZeroAVL(getInfoForVSETVLI(MI, MRI, LIS)))
return false;
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
index 65270ea3a40eee..ab1628370ffcfc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b -target-abi=lp64d \
-; RUN: --riscv-split-regalloc=1 -riscv-vsetvli-after-rvv-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
+; RUN: -riscv-vsetvli-after-rvv-regalloc=1 -verify-machineinstrs < %s | FileCheck %s
define <vscale x 2 x i1> @fcmp_ole_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb) nounwind strictfp {
; CHECK-LABEL: fcmp_ole_vv_nxv2f16:
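
Note on the insertVSETVLI hunk above: the freshly built PseudoVSETVLIX0 is now routed through fixupLIAfterInsertMI so LiveIntervals stays valid once this pass runs after register allocation; that helper is defined elsewhere in this series. As a rough sketch of what such a fixup minimally involves (an assumed shape, not the series' actual implementation), the new instruction has to be registered with SlotIndexes and any virtual register it newly defines needs an interval:

#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Assumed shape only, not the fixupLIAfterInsertMI from this series:
// index the newly inserted instruction and make sure every virtual
// register it defines has a live interval computed for it.
static void fixupLiveIntervalsAfterInsert(MachineInstr &MI,
                                          LiveIntervals *LIS) {
  LIS->InsertMachineInstrInMaps(MI);
  for (const MachineOperand &MO : MI.defs())
    if (MO.isReg() && MO.getReg().isVirtual() &&
        !LIS->hasInterval(MO.getReg()))
      LIS->createAndComputeVirtRegInterval(MO.getReg());
}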
>From fcf9db3bf040e992f97819601b6935d50f1fb09b Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Fri, 26 Apr 2024 01:13:03 -0700
Subject: [PATCH 10/19] Restore coalesceVSETVLIs
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 22 +-------------------
1 file changed, 1 insertion(+), 21 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 8d8eebf4dcdd77..cb35ef728ed843 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1775,8 +1775,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
DemandedFields Used;
Used.demandVL();
Used.demandVTYPE();
- SmallVector<MachineInstr*> ToDelete;
- SmallVector<MachineInstr *> MIInBetween;
+ SmallVector<MachineInstr *> ToDelete;
for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
if (!isVectorConfigInstr(MI)) {
@@ -1794,24 +1793,6 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
Used.demandVL();
if (NextMI) {
-
- // A tail undefined vmv.v.i/x or vfmv.v.f with VL=1 can be treated in the
- // same semantically as vmv.s.x.
- if (MIInBetween.size() == 1 && isScalarSplatInstr(*MIInBetween[0]) &&
- MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 1 &&
- isLMUL1OrSmaller(RISCVVType::getVLMUL(MI.getOperand(2).getImm())) &&
- hasUndefinedMergeOp(*MIInBetween[0], *MRI, LIS)) {
- Used.LMUL = false;
- Used.SEWLMULRatio = false;
- Used.VLAny = false;
- if (isFloatScalarMoveOrScalarSplatInstr(*MIInBetween[0]) &&
- !ST->hasVInstructionsF64())
- Used.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
- else
- Used.SEW = DemandedFields::SEWGreaterThanOrEqual;
- Used.TailPolicy = false;
- }
-
if (!Used.usedVL() && !Used.usedVTYPE()) {
ToDelete.push_back(&MI);
// Leave NextMI unchanged
@@ -1874,7 +1855,6 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
}
NextMI = &MI;
Used = getDemanded(MI, MRI, ST, LIS);
- MIInBetween.clear();
}
NumCoalescedVSETVL += ToDelete.size();
>From 68f3c74db7df36fa0843aa57681db857db0aa99f Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Fri, 26 Apr 2024 01:35:27 -0700
Subject: [PATCH 11/19] Handle AVLReg coming from a PHI
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 22 +++++----
llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll | 46 ++++++-------------
2 files changed, 27 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index cb35ef728ed843..8a67b11462b722 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -533,7 +533,7 @@ class VSETVLIInfo {
bool isUnknown() const { return State == Unknown; }
void setAVLRegDef(const MachineInstr *DefMI, Register AVLReg) {
- assert(DefMI && AVLReg.isVirtual());
+ assert(AVLReg.isVirtual());
AVLRegDef.DefMI = DefMI;
AVLRegDef.DefReg = AVLReg;
State = AVLIsReg;
@@ -550,6 +550,7 @@ class VSETVLIInfo {
bool hasAVLImm() const { return State == AVLIsImm; }
bool hasAVLReg() const { return State == AVLIsReg; }
+ bool hasAVLRegDefMI() const { return AVLRegDef.DefMI != nullptr; }
bool hasAVLVLMAX() const { return State == AVLIsVLMAX; }
bool hasAVLIgnored() const { return State == AVLIsIgnored; }
Register getAVLReg() const {
@@ -570,7 +571,8 @@ class VSETVLIInfo {
if (Info.isUnknown())
setUnknown();
else if (Info.hasAVLReg())
- setAVLRegDef(&Info.getAVLDefMI(), Info.getAVLReg());
+ setAVLRegDef(Info.hasAVLRegDefMI() ? &Info.getAVLDefMI() : nullptr,
+ Info.getAVLReg());
else if (Info.hasAVLVLMAX())
setAVLVLMAX();
else if (Info.hasAVLIgnored())
@@ -590,7 +592,7 @@ class VSETVLIInfo {
if (hasAVLImm())
return getAVLImm() > 0;
if (hasAVLReg())
- return isNonZeroLoadImmediate(getAVLDefMI());
+ return hasAVLRegDefMI() && isNonZeroLoadImmediate(getAVLDefMI());
if (hasAVLVLMAX())
return true;
if (hasAVLIgnored())
@@ -606,7 +608,9 @@ class VSETVLIInfo {
bool hasSameAVL(const VSETVLIInfo &Other) const {
if (hasAVLReg() && Other.hasAVLReg())
- return getAVLDefMI().isIdenticalTo(Other.getAVLDefMI()) &&
+ return ((!hasAVLRegDefMI() && !Other.hasAVLRegDefMI()) ||
+ (hasAVLRegDefMI() == Other.hasAVLRegDefMI() &&
+ getAVLDefMI().isIdenticalTo(Other.getAVLDefMI()))) &&
getAVLReg() == Other.getAVLReg();
if (hasAVLImm() && Other.hasAVLImm())
@@ -1034,7 +1038,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
// AVL operand with the AVL of the defining vsetvli. We avoid general
// register AVLs to avoid extending live ranges without being sure we can
// kill the original source reg entirely.
- if (InstrInfo.hasAVLReg()) {
+ if (InstrInfo.hasAVLReg() && InstrInfo.hasAVLRegDefMI()) {
const MachineInstr &DefMI = InstrInfo.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
@@ -1132,7 +1136,8 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// If our AVL is a virtual register, it might be defined by a VSET(I)VLI. If
// it has the same VLMAX we want and the last VL/VTYPE we observed is the
// same, we can use the X0, X0 form.
- if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg()) {
+ if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg() &&
+ Info.hasAVLRegDefMI()) {
const MachineInstr &DefMI = Info.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
@@ -1263,7 +1268,8 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// it might be defined by a VSET(I)VLI. If it has the same VLMAX we need
// and the last VL/VTYPE we observed is the same, we don't need a
// VSETVLI here.
- if (Require.hasAVLReg() && CurInfo.hasCompatibleVTYPE(Used, Require)) {
+ if (Require.hasAVLReg() && Require.hasAVLRegDefMI() &&
+ CurInfo.hasCompatibleVTYPE(Used, Require)) {
const MachineInstr &DefMI = Require.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
@@ -1666,7 +1672,7 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
// If the AVL value is a register (other than our VLMAX sentinel),
// we need to prove the value is available at the point we're going
// to insert the vsetvli at.
- if (AvailableInfo.hasAVLReg()) {
+ if (AvailableInfo.hasAVLReg() && AvailableInfo.hasAVLRegDefMI()) {
const MachineInstr *AVLDefMI = &AvailableInfo.getAVLDefMI();
// This is an inline dominance check which covers the case of
// UnavailablePred being the preheader of a loop.
diff --git a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
index ab1628370ffcfc..d9af55e29de53d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splitRA-vsetvl.ll
@@ -1469,57 +1469,36 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
+; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vmv8r.v v16, v8
; CHECK-NEXT: vl8re64.v v8, (a1)
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v0, a0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vsoxei64.v v16, (zero), v8, v0.t
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsoxei64.v v16, (zero), v8, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsoxei64.v v8, (zero), v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -1854,8 +1833,9 @@ define <4 x float> @tail_vmv_v_i_treat_as_vmv_s_x(<8 x float> %x, <8 x float> %y
; CHECK-NEXT: vmul.vx v14, v12, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vadd.vi v8, v14, -14
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
>From 64eedf3251daa2e1f372a6347e12c4fcc8b5b651 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Fri, 26 Apr 2024 07:15:26 -0700
Subject: [PATCH 12/19] Add assert in getReachingDefMI
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 24 +++++++++-----------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 8a67b11462b722..660a6c4f0b5d75 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/ErrorHandling.h"
#include <queue>
using namespace llvm;
@@ -57,6 +58,7 @@ namespace {
template <typename T>
static T *getReachingDefMI(Register Reg, T *MI, const MachineRegisterInfo *MRI,
const LiveIntervals *LIS) {
+
if (MRI->isSSA() || !MI)
return MRI->getUniqueVRegDef(Reg);
@@ -70,20 +72,16 @@ static T *getReachingDefMI(Register Reg, T *MI, const MachineRegisterInfo *MRI,
}))
return MI;
- if (Reg.isVirtual() && LIS->hasInterval(Reg)) {
- auto &LI = LIS->getInterval(Reg);
- SlotIndexes *SIs = LIS->getSlotIndexes();
- SlotIndex SI = SIs->getInstructionIndex(*MI);
- VNInfo *Valno = LI.getVNInfoBefore(SI);
- if (!Valno || Valno->isPHIDef())
- return nullptr;
- MachineInstr *DefMI = SIs->getInstructionFromIndex(Valno->def);
- return DefMI;
- }
+ assert(Reg.isVirtual() && LIS->hasInterval(Reg));
- // TODO: Handle physical register
-
- return nullptr;
+ auto &LI = LIS->getInterval(Reg);
+ SlotIndexes *SIs = LIS->getSlotIndexes();
+ SlotIndex SI = SIs->getInstructionIndex(*MI);
+ VNInfo *Valno = LI.getVNInfoBefore(SI);
+ if (!Valno || Valno->isPHIDef())
+ return nullptr;
+ MachineInstr *DefMI = SIs->getInstructionFromIndex(Valno->def);
+ return DefMI;
}
static unsigned getVLOpNum(const MachineInstr &MI) {
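
To make the lookup above easier to follow out of context: before register allocation the machine function is still in SSA form, so the unique virtual-register def answers the query directly; after allocation the defining instruction has to be recovered from LiveIntervals by taking the value number live just before the querying instruction. A condensed restatement of that core lookup (the templated getReachingDefMI above also has an early self-def check that is not repeated here):

#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include <cassert>

using namespace llvm;

// Mirrors the body of getReachingDefMI: SSA path first, LiveIntervals
// path once the function is out of SSA.
static const MachineInstr *reachingDef(Register Reg, const MachineInstr *MI,
                                       const MachineRegisterInfo *MRI,
                                       const LiveIntervals *LIS) {
  // Pre-RA (still SSA) or no query point: the unique vreg def is enough.
  if (MRI->isSSA() || !MI)
    return MRI->getUniqueVRegDef(Reg);

  assert(Reg.isVirtual() && LIS->hasInterval(Reg));
  const LiveInterval &LI = LIS->getInterval(Reg);
  SlotIndexes *SIs = LIS->getSlotIndexes();
  // The value number live immediately before MI is the reaching definition.
  SlotIndex SI = SIs->getInstructionIndex(*MI);
  const VNInfo *VN = LI.getVNInfoBefore(SI);
  if (!VN || VN->isPHIDef())
    return nullptr; // PHI-defined value: no single defining instruction.
  return SIs->getInstructionFromIndex(VN->def);
}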
>From 270c377d4b8ced4a314d3399e02d806048b5d68b Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 29 Apr 2024 03:08:12 -0700
Subject: [PATCH 13/19] Handle FaultFirstLoad that could use x0 as VL output
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 660a6c4f0b5d75..2828341b4f787a 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1365,9 +1365,12 @@ void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
if (RISCV::isFaultFirstLoad(MI)) {
// Update AVL to vl-output of the fault first load.
- Info.setAVLRegDef(
- getReachingDefMI(MI.getOperand(1).getReg(), &MI, MRI, LIS),
- MI.getOperand(1).getReg());
+ if (MI.getOperand(1).getReg() == RISCV::X0)
+ Info.setAVLVLMAX();
+ else
+ Info.setAVLRegDef(
+ getReachingDefMI(MI.getOperand(1).getReg(), &MI, MRI, LIS),
+ MI.getOperand(1).getReg());
return;
}
>From 22f6714d9c6c9a32867a10162436b7122075eb5a Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 29 Apr 2024 03:13:42 -0700
Subject: [PATCH 14/19] Remove useless include
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 2828341b4f787a..beba2684b47ab0 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/ErrorHandling.h"
#include <queue>
using namespace llvm;
>From 8e496696ab9b93dbdda54666bb687680fa6adf03 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 29 Apr 2024 03:20:25 -0700
Subject: [PATCH 15/19] Update format
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index beba2684b47ab0..d8cc2ebc82d6f8 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1139,11 +1139,12 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
- auto NeedFixupMI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE())
- .addReg(RISCV::VL, RegState::Implicit);
+ auto NeedFixupMI =
+ BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE())
+ .addReg(RISCV::VL, RegState::Implicit);
fixupLIAfterInsertMI(NeedFixupMI, LIS);
return;
}
@@ -1738,10 +1739,11 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
// Return true if we can mutate PrevMI to match MI without changing any the
// fields which would be observed.
-static bool canMutatePriorConfig(
- const MachineInstr &PrevMI, const MachineInstr &MI,
- const DemandedFields &Used, const MachineRegisterInfo &MRI,
- const LiveIntervals *LIS) {
+static bool canMutatePriorConfig(const MachineInstr &PrevMI,
+ const MachineInstr &MI,
+ const DemandedFields &Used,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
// If the VL values aren't equal, return false if either a) the former is
// demanded, or b) we can't rewrite the former to be the later for
// implementation reasons.
>From 9271d5df904d90534f0d92a25a8fa872d0e62a8b Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 29 Apr 2024 03:56:26 -0700
Subject: [PATCH 16/19] Make hasAVLRegDefMI also check hasAVLReg
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index d8cc2ebc82d6f8..d065658c187086 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -547,7 +547,9 @@ class VSETVLIInfo {
bool hasAVLImm() const { return State == AVLIsImm; }
bool hasAVLReg() const { return State == AVLIsReg; }
- bool hasAVLRegDefMI() const { return AVLRegDef.DefMI != nullptr; }
+ bool hasAVLRegDefMI() const {
+ return hasAVLReg() && (AVLRegDef.DefMI != nullptr);
+ }
bool hasAVLVLMAX() const { return State == AVLIsVLMAX; }
bool hasAVLIgnored() const { return State == AVLIsIgnored; }
Register getAVLReg() const {
@@ -1035,7 +1037,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
// AVL operand with the AVL of the defining vsetvli. We avoid general
// register AVLs to avoid extending live ranges without being sure we can
// kill the original source reg entirely.
- if (InstrInfo.hasAVLReg() && InstrInfo.hasAVLRegDefMI()) {
+ if (InstrInfo.hasAVLRegDefMI()) {
const MachineInstr &DefMI = InstrInfo.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
@@ -1133,8 +1135,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// If our AVL is a virtual register, it might be defined by a VSET(I)VLI. If
// it has the same VLMAX we want and the last VL/VTYPE we observed is the
// same, we can use the X0, X0 form.
- if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg() &&
- Info.hasAVLRegDefMI()) {
+ if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLRegDefMI()) {
const MachineInstr &DefMI = Info.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
@@ -1266,8 +1267,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// it might be defined by a VSET(I)VLI. If it has the same VLMAX we need
// and the last VL/VTYPE we observed is the same, we don't need a
// VSETVLI here.
- if (Require.hasAVLReg() && Require.hasAVLRegDefMI() &&
- CurInfo.hasCompatibleVTYPE(Used, Require)) {
+ if (Require.hasAVLRegDefMI() && CurInfo.hasCompatibleVTYPE(Used, Require)) {
const MachineInstr &DefMI = Require.getAVLDefMI();
if (isVectorConfigInstr(DefMI)) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI, LIS);
@@ -1673,7 +1673,7 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
// If the AVL value is a register (other than our VLMAX sentinel),
// we need to prove the value is available at the point we're going
// to insert the vsetvli at.
- if (AvailableInfo.hasAVLReg() && AvailableInfo.hasAVLRegDefMI()) {
+ if (AvailableInfo.hasAVLRegDefMI()) {
const MachineInstr *AVLDefMI = &AvailableInfo.getAVLDefMI();
// This is an inline dominance check which covers the case of
// UnavailablePred being the preheader of a loop.
>From add9f2c2b85561d3a86dbda8b01a96b05a72acf9 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 29 Apr 2024 04:59:49 -0700
Subject: [PATCH 17/19] Replace hasAVLReg with hasAVLRegDefMI inside
hasNonZeroAVL
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index d065658c187086..edefdc4056bc31 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -590,8 +590,8 @@ class VSETVLIInfo {
bool hasNonZeroAVL() const {
if (hasAVLImm())
return getAVLImm() > 0;
- if (hasAVLReg())
- return hasAVLRegDefMI() && isNonZeroLoadImmediate(getAVLDefMI());
+ if (hasAVLRegDefMI())
+ return isNonZeroLoadImmediate(getAVLDefMI());
if (hasAVLVLMAX())
return true;
if (hasAVLIgnored())
>From e504d987f282027c6f694611486e9ca4727e0e34 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 29 Apr 2024 05:21:40 -0700
Subject: [PATCH 18/19] Split hasAVLRegDefMI and hasAVLReg into two conditions
in hasSameAVL
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index edefdc4056bc31..ea722c3474f2b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -606,12 +606,13 @@ class VSETVLIInfo {
}
bool hasSameAVL(const VSETVLIInfo &Other) const {
- if (hasAVLReg() && Other.hasAVLReg())
- return ((!hasAVLRegDefMI() && !Other.hasAVLRegDefMI()) ||
- (hasAVLRegDefMI() == Other.hasAVLRegDefMI() &&
- getAVLDefMI().isIdenticalTo(Other.getAVLDefMI()))) &&
+ if (hasAVLRegDefMI() && Other.hasAVLRegDefMI())
+ return getAVLDefMI().isIdenticalTo(Other.getAVLDefMI()) &&
getAVLReg() == Other.getAVLReg();
+ if (hasAVLReg() && Other.hasAVLReg())
+ return getAVLReg() == Other.getAVLReg();
+
if (hasAVLImm() && Other.hasAVLImm())
return getAVLImm() == Other.getAVLImm();
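
After this change the AVL comparison reads as a small ladder. Restated as a free function for readability (a sketch only: the in-tree code is a VSETVLIInfo member, and the states not touched by this hunk, VLMAX and ignored AVLs, keep their existing handling):

// Illustrative only; VSETVLIInfo is file-local to RISCVInsertVSETVLI.cpp.
bool sameAVL(const VSETVLIInfo &A, const VSETVLIInfo &B) {
  // 1. Both sides know the defining instruction: require the same register
  //    and an identical def.
  if (A.hasAVLRegDefMI() && B.hasAVLRegDefMI())
    return A.getAVLDefMI().isIdenticalTo(B.getAVLDefMI()) &&
           A.getAVLReg() == B.getAVLReg();
  // 2. Register AVLs where a def is unknown (e.g. a PHI): compare registers.
  if (A.hasAVLReg() && B.hasAVLReg())
    return A.getAVLReg() == B.getAVLReg();
  // 3. Immediate AVLs: compare the immediates.
  if (A.hasAVLImm() && B.hasAVLImm())
    return A.getAVLImm() == B.getAVLImm();
  // Remaining states (VLMAX, ignored) are handled as before; not shown here.
  return false;
}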
>From 4fb692eb993dd4d3f36756a7d6ad445707288273 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 29 Apr 2024 05:34:39 -0700
Subject: [PATCH 19/19] Remove MI from needVSETVLIPHI since it can be recovered
 from VSETVLIInfo
---
llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index ea722c3474f2b0..509313d5a4c122 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -867,8 +867,8 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
private:
bool needVSETVLI(const MachineInstr &MI, const VSETVLIInfo &Require,
const VSETVLIInfo &CurInfo) const;
- bool needVSETVLIPHI(const VSETVLIInfo &Require, const MachineBasicBlock &MBB,
- const MachineInstr &MI) const;
+ bool needVSETVLIPHI(const VSETVLIInfo &Require,
+ const MachineBasicBlock &MBB) const;
void insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo);
void insertVSETVLI(MachineBasicBlock &MBB,
@@ -1458,8 +1458,7 @@ void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) {
// be unneeded if the AVL is a phi node where all incoming values are VL
// outputs from the last VSETVLI in their respective basic blocks.
bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
- const MachineBasicBlock &MBB,
- const MachineInstr &MI) const {
+ const MachineBasicBlock &MBB) const {
if (DisableInsertVSETVLPHIOpt)
return true;
@@ -1473,7 +1472,11 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
LiveRange &LR = LIS->getInterval(Require.getAVLReg());
SlotIndexes *SIs = LIS->getSlotIndexes();
- SlotIndex SI = SIs->getInstructionIndex(MI);
+
+ if (!Require.hasAVLRegDefMI())
+ return true;
+
+ SlotIndex SI = SIs->getInstructionIndex(Require.getAVLDefMI());
VNInfo *Valno = LR.getVNInfoAt(SI);
if (!Valno || !Valno->isPHIDef())
return true;
@@ -1563,7 +1566,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
// wouldn't be used and VL/VTYPE registers are correct. Note that
// we *do* need to model the state as if it changed as while the
// register contents are unchanged, the abstract model can change.
- if (!PrefixTransparent || needVSETVLIPHI(CurInfo, MBB, MI))
+ if (!PrefixTransparent || needVSETVLIPHI(CurInfo, MBB))
insertVSETVLI(MBB, MI, CurInfo, PrevInfo);
PrefixTransparent = false;
}