[llvm] [RISCV] Use vmv.v.x for any rv32 e64 splat with equal halves (PR #130530)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 9 17:29:27 PDT 2025
https://github.com/preames created https://github.com/llvm/llvm-project/pull/130530
The prior logic was reasoning in terms of vsetivli immediates, but using
the vmv.v.x is strongly profitable for high LMUL cases. The key
difference is that the vmv.v.x form is rematerializeable during
register allocation, and the vlse form is not.
The review contains two changes - the first allows using the register form
of the vsetvli for constants larger than 31. The second instead uses the
vlmax form of the vsetvli. The latter has slightly more localized scalar
register pressure impact, but increases VL which some hardware might
not like. I don't have a strong preference between them
and am happy to go with whichever form reviewers prefer.
My main motivation with this patch is eliminating a couple of extremely
constrained register allocation cases from the test list.
>From d95a0f963499e58b22d30650cc317274cb163ec2 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Sun, 9 Mar 2025 16:37:21 -0700
Subject: [PATCH 1/2] [RISCV] Use vmv.v.x for any constant VL rv32 e64 splat
with equal halves
The prior logic was reasoning in terms of vsetivli immediates, but using
the vmv.v.x is strongly profitable for high LMUL cases. The key
difference is that the vmv.v.x form is rematerializeable during
register allocation, and the vlse form is not.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +-
.../RISCV/rvv/fixed-vectors-bitreverse-vp.ll | 574 +++---
.../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 1622 ++++++----------
.../RISCV/rvv/fixed-vectors-ctpop-vp.ll | 650 +++----
.../RISCV/rvv/fixed-vectors-cttz-vp.ll | 1648 +++++++----------
5 files changed, 1779 insertions(+), 2717 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 36d113a63cd1d..d4eec5a6a5bed 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4372,7 +4372,7 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,
(isa<RegisterSDNode>(VL) &&
cast<RegisterSDNode>(VL)->getReg() == RISCV::X0))
NewVL = DAG.getRegister(RISCV::X0, MVT::i32);
- else if (isa<ConstantSDNode>(VL) && isUInt<4>(VL->getAsZExtVal()))
+ else if (isa<ConstantSDNode>(VL))
NewVL = DAG.getNode(ISD::ADD, DL, VL.getValueType(), VL, VL);
if (NewVL) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 5ea4924468595..e4c5afc2d8590 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1652,116 +1652,107 @@ declare <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64>, <15 x i1>, i32)
define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: lui a1, 1044480
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmv8r.v v24, v8
-; RV32-NEXT: lui a2, 1044480
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: lui a4, 209715
-; RV32-NEXT: lui a5, 349525
-; RV32-NEXT: li a1, 56
-; RV32-NEXT: lui a6, 16
-; RV32-NEXT: sw a2, 16(sp)
-; RV32-NEXT: sw zero, 20(sp)
-; RV32-NEXT: addi a2, a3, -241
-; RV32-NEXT: sw a2, 40(sp)
-; RV32-NEXT: sw a2, 44(sp)
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: addi a3, a4, 819
-; RV32-NEXT: sw a3, 32(sp)
-; RV32-NEXT: sw a3, 36(sp)
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: addi a4, a5, 1365
-; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a5, a6, -256
-; RV32-NEXT: sw a4, 24(sp)
-; RV32-NEXT: sw a4, 28(sp)
-; RV32-NEXT: vand.vx v8, v8, a5, v0.t
-; RV32-NEXT: vsll.vx v8, v8, a2, v0.t
-; RV32-NEXT: vor.vv v8, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT: addi a1, a3, -256
+; RV32-NEXT: vand.vx v24, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
+; RV32-NEXT: vlse64.v v16, (a5), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: lui a3, 4080
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vx v16, v24, a3, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v16, v24, v8, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v24, a1, v0.t
-; RV32-NEXT: vsrl.vx v8, v24, a2, v0.t
-; RV32-NEXT: vand.vx v8, v8, a5, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t
-; RV32-NEXT: vand.vx v16, v8, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v24, 8, v0.t
+; RV32-NEXT: vand.vx v24, v8, a3, v0.t
+; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: addi a5, sp, 16
+; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT: vand.vx v24, v24, a1, v0.t
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a3, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a2), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 4, v0.t
; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a4
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
@@ -1774,8 +1765,8 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -1863,91 +1854,83 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: li a5, 56
-; RV32-NEXT: lui a6, 16
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a5
-; RV32-NEXT: vsrl.vx v24, v8, a5
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw zero, 20(sp)
-; RV32-NEXT: addi a1, a2, -241
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: addi a2, a3, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: addi a3, a4, 1365
-; RV32-NEXT: addi a4, a6, -256
-; RV32-NEXT: vsrl.vx v0, v8, a5
-; RV32-NEXT: sw a3, 24(sp)
-; RV32-NEXT: sw a3, 28(sp)
-; RV32-NEXT: vand.vx v0, v0, a4
+; RV32-NEXT: vsll.vx v16, v8, a2
+; RV32-NEXT: addi a1, a3, -256
+; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: vsrl.vx v0, v8, a4
+; RV32-NEXT: vand.vx v0, v0, a1
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a4
-; RV32-NEXT: vsll.vx v0, v0, a5
-; RV32-NEXT: vor.vv v16, v16, v0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v24, v8, a1
+; RV32-NEXT: vsll.vx v24, v24, a4
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vlse64.v v24, (a6), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a1
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a1
+; RV32-NEXT: vand.vx v16, v16, a5
+; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vand.vv v0, v0, v24
+; RV32-NEXT: vor.vv v16, v0, v16
+; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v0, v8, v24
-; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: vor.vv v24, v8, v24
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 3
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a2), zero
+; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
@@ -1959,8 +1942,8 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -2048,116 +2031,107 @@ declare <16 x i64> @llvm.vp.bitreverse.v16i64(<16 x i64>, <16 x i1>, i32)
define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: lui a1, 1044480
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmv8r.v v24, v8
-; RV32-NEXT: lui a2, 1044480
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: lui a4, 209715
-; RV32-NEXT: lui a5, 349525
-; RV32-NEXT: li a1, 56
-; RV32-NEXT: lui a6, 16
-; RV32-NEXT: sw a2, 16(sp)
-; RV32-NEXT: sw zero, 20(sp)
-; RV32-NEXT: addi a2, a3, -241
-; RV32-NEXT: sw a2, 40(sp)
-; RV32-NEXT: sw a2, 44(sp)
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: addi a3, a4, 819
-; RV32-NEXT: sw a3, 32(sp)
-; RV32-NEXT: sw a3, 36(sp)
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: addi a4, a5, 1365
-; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a5, a6, -256
-; RV32-NEXT: sw a4, 24(sp)
-; RV32-NEXT: sw a4, 28(sp)
-; RV32-NEXT: vand.vx v8, v8, a5, v0.t
-; RV32-NEXT: vsll.vx v8, v8, a2, v0.t
-; RV32-NEXT: vor.vv v8, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT: addi a1, a3, -256
+; RV32-NEXT: vand.vx v24, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
+; RV32-NEXT: vlse64.v v16, (a5), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: lui a3, 4080
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vx v16, v24, a3, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v16, v24, v8, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v24, a1, v0.t
-; RV32-NEXT: vsrl.vx v8, v24, a2, v0.t
-; RV32-NEXT: vand.vx v8, v8, a5, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t
-; RV32-NEXT: vand.vx v16, v8, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v24, 8, v0.t
+; RV32-NEXT: vand.vx v24, v8, a3, v0.t
+; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: addi a5, sp, 16
+; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT: vand.vx v24, v24, a1, v0.t
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a3, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a2), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 4, v0.t
; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a4
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
@@ -2170,8 +2144,8 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -2259,91 +2233,83 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bitreverse_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: li a5, 56
-; RV32-NEXT: lui a6, 16
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a5
-; RV32-NEXT: vsrl.vx v24, v8, a5
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw zero, 20(sp)
-; RV32-NEXT: addi a1, a2, -241
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: addi a2, a3, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: addi a3, a4, 1365
-; RV32-NEXT: addi a4, a6, -256
-; RV32-NEXT: vsrl.vx v0, v8, a5
-; RV32-NEXT: sw a3, 24(sp)
-; RV32-NEXT: sw a3, 28(sp)
-; RV32-NEXT: vand.vx v0, v0, a4
+; RV32-NEXT: vsll.vx v16, v8, a2
+; RV32-NEXT: addi a1, a3, -256
+; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: vsrl.vx v0, v8, a4
+; RV32-NEXT: vand.vx v0, v0, a1
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a4
-; RV32-NEXT: vsll.vx v0, v0, a5
-; RV32-NEXT: vor.vv v16, v16, v0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v24, v8, a1
+; RV32-NEXT: vsll.vx v24, v24, a4
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vlse64.v v24, (a6), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a1
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a1
+; RV32-NEXT: vand.vx v16, v16, a5
+; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vand.vv v0, v0, v24
+; RV32-NEXT: vor.vv v16, v0, v16
+; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v0, v8, v24
-; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: vor.vv v24, v8, v24
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 3
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v24, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a2), zero
+; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
@@ -2355,8 +2321,8 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 9d0d42cf754c5..0f6139ec1b65f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1503,41 +1503,17 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -1547,58 +1523,34 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v15i64:
@@ -1655,68 +1607,54 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vx v0, v8, a1
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: vsrl.vi v16, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
-; RV32-NEXT: vand.vv v0, v24, v16
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v15i64_unmasked:
@@ -1775,41 +1713,17 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -1819,58 +1733,34 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v16i64:
@@ -1927,68 +1817,54 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vx v0, v8, a1
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: vsrl.vi v16, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
-; RV32-NEXT: vand.vv v0, v24, v16
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_v16i64_unmasked:
@@ -2047,37 +1923,21 @@ declare <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: li a1, 16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 16(sp)
-; RV32-NEXT: sw a2, 20(sp)
; RV32-NEXT: mv a2, a0
; RV32-NEXT: bltu a0, a1, .LBB34_2
; RV32-NEXT: # %bb.1:
@@ -2086,9 +1946,9 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: addi a4, sp, 32
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -2099,144 +1959,119 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a3, a3, a5
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a4), zero
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
+; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a3, a3, a5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
@@ -2244,15 +2079,15 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
@@ -2262,71 +2097,55 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -2335,27 +2154,27 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -2478,52 +2297,38 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: li a2, 16
; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a3, .LBB35_2
+; RV32-NEXT: bltu a0, a2, .LBB35_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB35_2:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: li a2, 32
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v0
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsrl.vi v0, v8, 2
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsrl.vi v0, v8, 4
@@ -2550,90 +2355,74 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 16
-; RV32-NEXT: vor.vv v16, v16, v8
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v16, a2
+; RV32-NEXT: vsrl.vi v0, v16, 16
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v24, v24, v8
-; RV32-NEXT: vadd.vv v24, v0, v24
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vsrl.vi v0, v16, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vor.vv v24, v16, v8
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsub.vv v0, v16, v0
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v0, v8
+; RV32-NEXT: vand.vv v8, v0, v16
; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v8, v0, v8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
+; RV32-NEXT: vnot.v v24, v24
+; RV32-NEXT: vsrl.vi v0, v24, 1
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vsub.vv v24, v24, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v24, v16
+; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -4213,41 +4002,17 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -4257,58 +4022,34 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v15i64:
@@ -4365,68 +4106,54 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vx v0, v8, a1
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: vsrl.vi v16, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
-; RV32-NEXT: vand.vv v0, v24, v16
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
@@ -4483,41 +4210,17 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -4527,58 +4230,34 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v16i64:
@@ -4635,68 +4314,54 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: li a1, 32
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vx v0, v8, a1
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: vsrl.vi v16, v8, 2
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 16
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
-; RV32-NEXT: vand.vv v0, v24, v16
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
@@ -4753,37 +4418,21 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: li a1, 16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 16(sp)
-; RV32-NEXT: sw a2, 20(sp)
; RV32-NEXT: mv a2, a0
; RV32-NEXT: bltu a0, a1, .LBB70_2
; RV32-NEXT: # %bb.1:
@@ -4792,9 +4441,9 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: addi a4, sp, 32
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -4805,144 +4454,119 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a3, a3, a5
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a4), zero
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
+; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a3, a3, a5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
@@ -4950,15 +4574,15 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
@@ -4968,71 +4592,55 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -5041,27 +4649,27 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -5184,52 +4792,38 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: li a2, 16
; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a3, .LBB71_2
+; RV32-NEXT: bltu a0, a2, .LBB71_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB71_2:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: li a2, 32
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v0
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsrl.vi v0, v8, 2
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsrl.vi v0, v8, 4
@@ -5256,90 +4850,74 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 16
-; RV32-NEXT: vor.vv v16, v16, v8
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v16, a2
+; RV32-NEXT: vsrl.vi v0, v16, 16
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v24, v24, v8
-; RV32-NEXT: vadd.vv v24, v0, v24
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vsrl.vi v0, v16, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vor.vv v24, v16, v8
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsub.vv v0, v16, v0
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v0, v8
+; RV32-NEXT: vand.vv v8, v0, v16
; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v8, v0, v8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
+; RV32-NEXT: vnot.v v24, v24
+; RV32-NEXT: vsrl.vi v0, v24, 1
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vsub.vv v24, v24, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v24, v16
+; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index a5a1061842427..a774d546a715b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1119,71 +1119,41 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32)
define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v15i64:
@@ -1226,54 +1196,41 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v16, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v8, v8, v0
-; RV32-NEXT: vand.vv v0, v8, v24
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v15i64_unmasked:
@@ -1318,71 +1275,41 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32)
define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v16i64:
@@ -1425,54 +1352,41 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v16, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v8, v8, v0
-; RV32-NEXT: vand.vv v0, v8, v24
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v16i64_unmasked:
@@ -1517,129 +1431,100 @@ declare <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64>, <32 x i1>, i32)
define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: li a2, 16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v7, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a3, .LBB34_2
+; RV32-NEXT: bltu a0, a2, .LBB34_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB34_2:
-; RV32-NEXT: addi a2, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: li a3, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a2, a2, a4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a2, a2, a4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: addi a2, sp, 24
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
-; RV32-NEXT: addi a2, sp, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: addi a2, a0, -16
; RV32-NEXT: sltu a0, a0, a2
@@ -1647,53 +1532,58 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsub.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: vadd.vv v8, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -1789,158 +1679,94 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv8r.v v24, v16
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: li a3, 16
-; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: mv a1, a0
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: mv a2, a0
; RV32-NEXT: bltu a0, a3, .LBB35_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
+; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB35_2:
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: addi a2, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: addi a2, sp, 32
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vmv8r.v v8, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v24, 1
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v24, v16
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v16, v0
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v8, v0
-; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: sub sp, sp, a3
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: vand.vv v0, v0, v24
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v8, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a2), zero
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v0, v8
+; RV32-NEXT: vsrl.vi v0, v0, 2
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: vadd.vv v0, v24, v0
+; RV32-NEXT: addi a3, a0, -16
+; RV32-NEXT: sltu a0, a0, a3
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v24, v0
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v24, v16, 1
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v24, v24, v8
+; RV32-NEXT: vsub.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v0, 4
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v24, v0
+; RV32-NEXT: vand.vv v0, v24, v8
+; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v8, v24, v8
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v0
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v8, v16, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v24, a1
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 0fb0fbe073e6b..7288aee768ef4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1263,92 +1263,45 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
; RV32-NEXT: li a1, 1
+; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v15i64:
@@ -1395,58 +1348,45 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vnot.v v16, v8
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: vsub.vx v8, v8, a1
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsrl.vi v8, v16, 1
-; RV32-NEXT: vand.vv v0, v8, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v16, v0
-; RV32-NEXT: vand.vv v0, v16, v24
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v15i64_unmasked:
@@ -1495,92 +1435,45 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
; RV32-NEXT: li a1, 1
+; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v16i64:
@@ -1627,58 +1520,45 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vnot.v v16, v8
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: vsub.vx v8, v8, a1
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsrl.vi v8, v16, 1
-; RV32-NEXT: vand.vv v0, v8, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v16, v0
-; RV32-NEXT: vand.vv v0, v16, v24
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v16i64_unmasked:
@@ -1727,263 +1607,245 @@ declare <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 48
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v7, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: li a1, 16
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 16(sp)
-; RV32-NEXT: sw a2, 20(sp)
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v0, 2
; RV32-NEXT: mv a2, a0
; RV32-NEXT: bltu a0, a1, .LBB34_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB34_2:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: addi a3, sp, 40
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: li a3, 32
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: addi a4, a4, 1365
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li a6, 48
+; RV32-NEXT: mul a5, a5, a6
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 24
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a4, 209715
+; RV32-NEXT: addi a4, a4, 819
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a4, 61681
+; RV32-NEXT: addi a4, a4, -241
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a4, 4112
+; RV32-NEXT: addi a4, a4, 257
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
+; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -2086,108 +1948,92 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: li a2, 16
; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a3, .LBB35_2
+; RV32-NEXT: bltu a0, a2, .LBB35_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB35_2:
-; RV32-NEXT: li a2, 1
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: li a3, 1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a4
+; RV32-NEXT: addi a4, a0, -16
+; RV32-NEXT: sltu a0, a0, a4
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: and a0, a0, a4
+; RV32-NEXT: lui a4, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v8, a2
+; RV32-NEXT: vsub.vx v8, v8, a3
+; RV32-NEXT: addi a4, a4, 819
; RV32-NEXT: vand.vv v8, v0, v8
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v0, v16, a2
-; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: vsub.vx v0, v16, a3
+; RV32-NEXT: vnot.v v16, v16
+; RV32-NEXT: vand.vv v0, v16, v0
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v0, v0, v8
-; RV32-NEXT: vadd.vv v24, v24, v0
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v16, v8
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a2), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v24, v16
+; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: lui a4, 4112
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: addi a4, a4, 257
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v0
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v0
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -3495,92 +3341,45 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
; RV32-NEXT: li a1, 1
+; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v15i64:
@@ -3627,58 +3426,45 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vnot.v v16, v8
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: vsub.vx v8, v8, a1
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsrl.vi v8, v16, 1
-; RV32-NEXT: vand.vv v0, v8, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v16, v0
-; RV32-NEXT: vand.vv v0, v16, v24
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
@@ -3725,92 +3511,45 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
; RV32-NEXT: li a1, 1
+; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v16i64:
@@ -3857,58 +3596,45 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a1), zero
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vnot.v v16, v8
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: vsub.vx v8, v8, a1
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsrl.vi v8, v16, 1
-; RV32-NEXT: vand.vv v0, v8, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v16, v0
-; RV32-NEXT: vand.vv v0, v16, v24
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 32
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
@@ -3955,263 +3681,245 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 48
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v7, v0, 2
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: li a1, 16
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: sw a2, 16(sp)
-; RV32-NEXT: sw a2, 20(sp)
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v0, 2
; RV32-NEXT: mv a2, a0
; RV32-NEXT: bltu a0, a1, .LBB70_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB70_2:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: addi a3, sp, 40
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: li a3, 32
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: addi a4, a4, 1365
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li a6, 48
+; RV32-NEXT: mul a5, a5, a6
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 24
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
+; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a4, 209715
+; RV32-NEXT: addi a4, a4, 819
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a4, 61681
+; RV32-NEXT: addi a4, a4, -241
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a4, 4112
+; RV32-NEXT: addi a4, a4, 257
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
+; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -4314,108 +4022,92 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: li a2, 16
; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a3, .LBB71_2
+; RV32-NEXT: bltu a0, a2, .LBB71_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB71_2:
-; RV32-NEXT: li a2, 1
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: li a3, 1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a4
+; RV32-NEXT: addi a4, a0, -16
+; RV32-NEXT: sltu a0, a0, a4
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: and a0, a0, a4
+; RV32-NEXT: lui a4, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v8, a2
+; RV32-NEXT: vsub.vx v8, v8, a3
+; RV32-NEXT: addi a4, a4, 819
; RV32-NEXT: vand.vv v8, v0, v8
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v0, v16, a2
-; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: vsub.vx v0, v16, a3
+; RV32-NEXT: vnot.v v16, v16
+; RV32-NEXT: vand.vv v0, v16, v0
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a4
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v0, v0, v8
-; RV32-NEXT: vadd.vv v24, v24, v0
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v16, v8
-; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a2), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a3), zero
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v24, v16
+; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v16, v24, v16
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: lui a4, 4112
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: addi a4, a4, 257
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v0
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v0
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
>From 1f06368d85d7b82437782d4c8ad8341393b06209 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Sun, 9 Mar 2025 17:17:25 -0700
Subject: [PATCH 2/2] Switch to using vlmax form
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 28 +-
.../RISCV/rvv/fixed-vectors-bitreverse-vp.ll | 96 ++-
.../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 472 ++++++------
.../RISCV/rvv/fixed-vectors-ctpop-vp.ll | 247 +++---
.../RISCV/rvv/fixed-vectors-cttz-vp.ll | 726 +++++++++---------
5 files changed, 776 insertions(+), 793 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d4eec5a6a5bed..27a4bbce1f5fc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4361,27 +4361,19 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,
if ((LoC >> 31) == HiC)
return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Lo, VL);
- // If vl is equal to VLMAX or fits in 4 bits and Hi constant is equal to Lo,
- // we could use vmv.v.x whose EEW = 32 to lower it. This allows us to use
- // vlmax vsetvli or vsetivli to change the VL.
- // FIXME: Support larger constants?
- // FIXME: Support non-constant VLs by saturating?
+ // Use vmv.v.x with EEW=32. Use either a vsetivli or vsetvli to change
+ // VL. This can temporarily increase VL if VL less than VLMAX.
if (LoC == HiC) {
SDValue NewVL;
- if (isAllOnesConstant(VL) ||
- (isa<RegisterSDNode>(VL) &&
- cast<RegisterSDNode>(VL)->getReg() == RISCV::X0))
- NewVL = DAG.getRegister(RISCV::X0, MVT::i32);
- else if (isa<ConstantSDNode>(VL))
+ if (isa<ConstantSDNode>(VL) && isUInt<4>(VL->getAsZExtVal()))
NewVL = DAG.getNode(ISD::ADD, DL, VL.getValueType(), VL, VL);
-
- if (NewVL) {
- MVT InterVT =
- MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2);
- auto InterVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterVT,
- DAG.getUNDEF(InterVT), Lo, NewVL);
- return DAG.getNode(ISD::BITCAST, DL, VT, InterVec);
- }
+ else
+ NewVL = DAG.getRegister(RISCV::X0, MVT::i32);
+ MVT InterVT =
+ MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2);
+ auto InterVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterVT,
+ DAG.getUNDEF(InterVT), Lo, NewVL);
+ return DAG.getNode(ISD::BITCAST, DL, VT, InterVec);
}
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index e4c5afc2d8590..a72e21c55fa95 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1725,13 +1725,12 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
@@ -1743,16 +1742,16 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 4, v0.t
; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
@@ -1902,35 +1901,34 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a4
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
@@ -2104,13 +2102,12 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
@@ -2122,16 +2119,16 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 4, v0.t
; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
@@ -2281,35 +2278,34 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 3
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a4
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 0f6139ec1b65f..a973fb7efff33 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1504,48 +1504,48 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
; RV32-LABEL: vp_ctlz_v15i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vnot.v v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vnot.v v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v24, v16, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -1609,15 +1609,13 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 4
@@ -1627,29 +1625,31 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vi v16, v8, 16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -1714,48 +1714,48 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
; RV32-LABEL: vp_ctlz_v16i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vnot.v v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vnot.v v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v24, v16, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -1819,15 +1819,13 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 4
@@ -1837,29 +1835,31 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vi v16, v8, 16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -1959,7 +1959,7 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
@@ -2004,7 +2004,7 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 48
@@ -2045,7 +2045,7 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 24
@@ -2057,7 +2057,7 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
@@ -2314,7 +2314,7 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: li a2, 32
; RV32-NEXT: lui a3, 349525
; RV32-NEXT: addi a3, a3, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
@@ -2366,10 +2366,10 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v8, v16, a2
; RV32-NEXT: vor.vv v24, v16, v8
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v0, v16
; RV32-NEXT: vsrl.vi v0, v0, 2
@@ -2378,36 +2378,36 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v24, v24
; RV32-NEXT: vsrl.vi v0, v24, 1
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v0, v0, v16
; RV32-NEXT: vsub.vv v24, v24, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v0
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vadd.vv v16, v0, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: lui a3, 4112
-; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -4003,48 +4003,48 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vnot.v v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vnot.v v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v24, v16, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -4108,15 +4108,13 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 4
@@ -4126,29 +4124,31 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vi v16, v8, 16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -4211,48 +4211,48 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vnot.v v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vnot.v v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v24, v16, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -4316,15 +4316,13 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
; RV32: # %bb.0:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 32
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsrl.vi v16, v8, 2
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 4
@@ -4334,29 +4332,31 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vi v16, v8, 16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -4454,7 +4454,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
@@ -4499,7 +4499,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 48
@@ -4540,7 +4540,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 24
@@ -4552,7 +4552,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
@@ -4809,7 +4809,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: li a2, 32
; RV32-NEXT: lui a3, 349525
; RV32-NEXT: addi a3, a3, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
@@ -4861,10 +4861,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v8, v16, a2
; RV32-NEXT: vor.vv v24, v16, v8
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a3
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v0, v16
; RV32-NEXT: vsrl.vi v0, v0, 2
@@ -4873,36 +4873,36 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v24, v24
; RV32-NEXT: vsrl.vi v0, v24, 1
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v0, v0, v16
; RV32-NEXT: vsub.vv v24, v24, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v0
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vadd.vv v16, v0, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: lui a3, 4112
-; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index a774d546a715b..78421e8488600 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1122,15 +1122,14 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
@@ -1139,7 +1138,7 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
@@ -1147,7 +1146,7 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1199,15 +1198,14 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
@@ -1216,7 +1214,7 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
@@ -1224,7 +1222,7 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
@@ -1278,15 +1276,14 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
@@ -1295,7 +1292,7 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a1
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
@@ -1303,7 +1300,7 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1355,15 +1352,14 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: li a2, 32
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
@@ -1372,7 +1368,7 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
@@ -1380,7 +1376,7 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
@@ -1455,29 +1451,28 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: li a3, 32
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a2, a2, a4
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a2, a2, a4
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1486,23 +1481,46 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
@@ -1513,7 +1531,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: addi a2, sp, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
@@ -1540,27 +1558,38 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: vsub.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
@@ -1679,95 +1708,71 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: bltu a0, a3, .LBB35_2
-; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB35_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB35_2:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: sub sp, sp, a3
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: vsub.vv v0, v8, v0
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v0, v0, v8
-; RV32-NEXT: vadd.vv v0, v24, v0
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v0, a2
+; RV32-NEXT: addi a2, a0, -16
+; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v24, v16, 1
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8
-; RV32-NEXT: vsub.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v0, 4
-; RV32-NEXT: vadd.vv v16, v0, v16
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v0, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v8, v24, v8
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: lui a3, 4112
-; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsub.vv v16, v16, v24
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v8, v0
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: lui a2, 4112
+; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
-; RV32-NEXT: li a1, 56
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v8, v8, a2
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v32i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 7288aee768ef4..a3eaf37631481 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1264,39 +1264,38 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
; RV32-LABEL: vp_cttz_v15i64:
; RV32: # %bb.0:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -1351,37 +1350,36 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v16, v8
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsub.vx v8, v8, a1
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -1436,39 +1434,38 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
; RV32-LABEL: vp_cttz_v16i64:
; RV32: # %bb.0:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -1523,37 +1520,36 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v16, v8
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsub.vx v8, v8, a1
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -1628,118 +1624,117 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB34_2:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: li a3, 32
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 48
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a4, 209715
-; RV32-NEXT: addi a4, a4, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a4, 61681
-; RV32-NEXT: addi a4, a4, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a4, 4112
-; RV32-NEXT: addi a4, a4, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
@@ -1960,42 +1955,42 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: li a3, 1
+; RV32-NEXT: li a2, 1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a4
-; RV32-NEXT: addi a4, a0, -16
-; RV32-NEXT: sltu a0, a0, a4
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: addi a3, a0, -16
+; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a4
-; RV32-NEXT: lui a4, 209715
+; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v8, a3
-; RV32-NEXT: addi a4, a4, 819
+; RV32-NEXT: vsub.vx v8, v8, a2
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vand.vv v8, v0, v8
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v0, v16, a3
+; RV32-NEXT: vsub.vx v0, v16, a2
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v0, v16, v0
; RV32-NEXT: vsrl.vi v16, v0, 1
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2005,21 +2000,21 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: lui a4, 4112
-; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: addi a4, a4, 257
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vadd.vv v16, v0, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a4
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -3342,39 +3337,38 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
; RV32-LABEL: vp_cttz_zero_undef_v15i64:
; RV32: # %bb.0:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -3429,37 +3423,36 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v16, v8
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsub.vx v8, v8, a1
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -3512,39 +3505,38 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
; RV32-LABEL: vp_cttz_zero_undef_v16i64:
; RV32: # %bb.0:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
@@ -3599,37 +3591,36 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v16, v8
-; RV32-NEXT: lui a2, 349525
; RV32-NEXT: vsub.vx v8, v8, a1
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: vand.vv v24, v16, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vmul.vv v8, v8, v24
@@ -3702,118 +3693,117 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB70_2:
; RV32-NEXT: li a1, 1
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: li a3, 32
+; RV32-NEXT: lui a3, 349525
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 48
-; RV32-NEXT: mul a5, a5, a6
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a4, 209715
-; RV32-NEXT: addi a4, a4, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a4, 61681
-; RV32-NEXT: addi a4, a4, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a4, 4112
-; RV32-NEXT: addi a4, a4, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
@@ -4034,42 +4024,42 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: li a3, 1
+; RV32-NEXT: li a2, 1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: lui a4, 349525
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a4
-; RV32-NEXT: addi a4, a0, -16
-; RV32-NEXT: sltu a0, a0, a4
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: addi a3, a0, -16
+; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a4
-; RV32-NEXT: lui a4, 209715
+; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v8, a3
-; RV32-NEXT: addi a4, a4, 819
+; RV32-NEXT: vsub.vx v8, v8, a2
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vand.vv v8, v0, v8
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v0, v16, a3
+; RV32-NEXT: vsub.vx v0, v16, a2
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v0, v16, v0
; RV32-NEXT: vsrl.vi v16, v0, 1
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -4079,21 +4069,21 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: lui a4, 4112
-; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: addi a4, a4, 257
+; RV32-NEXT: lui a2, 61681
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a2, a2, -241
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vadd.vv v16, v0, v16
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a4
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
More information about the llvm-commits
mailing list