[llvm] [SelectionDAG] Expand CTTZ_ELTS[_ZERO_POISON] and handle splitting (PR #185605)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 24 01:09:19 PDT 2026
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/185605
>From 4de39a9b2d0e38a6fadcb491037830cfd426ccf2 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 10 Mar 2026 01:13:18 +0800
Subject: [PATCH 1/8] Precommit tests
---
.../RISCV/rvv/fixed-vectors-cttz-elts.ll | 2190 +++++++++++++++++
1 file changed, 2190 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll
index 632c9a5a75911..1d1b1a02746ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll
@@ -42,6 +42,2196 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
ret i16 %res
}
+define i16 @ctz_v2048i1(<2048 x i1> %a) {
+; RV32-LABEL: ctz_v2048i1:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xef, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 111 * vlenb
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 5
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv1r.v v0, v22
+; RV32-NEXT: li a0, 64
+; RV32-NEXT: li a1, 128
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v16, v8, -1, v0
+; RV32-NEXT: vid.v v8
+; RV32-NEXT: vrsub.vx v24, v8, a1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: li a1, 1152
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v24, v24, -1, v0
+; RV32-NEXT: vrsub.vx v0, v8, a1
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vmaxu.vv v16, v24, v16
+; RV32-NEXT: li a1, 640
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v24, v24, -1, v0
+; RV32-NEXT: vrsub.vx v0, v8, a1
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: li a1, 1664
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v0
+; RV32-NEXT: vrsub.vx v0, v0, a1
+; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vmaxu.vv v8, v8, v24
+; RV32-NEXT: li a1, 384
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v24, v24, -1, v0
+; RV32-NEXT: vid.v v0
+; RV32-NEXT: vrsub.vx v0, v0, a1
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vmaxu.vv v8, v8, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: li a1, 1408
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a1
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vmaxu.vv v8, v8, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: li a1, 896
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a1
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: li a1, 1920
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmerge.vim v16, v24, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vmaxu.vv v16, v16, v8
+; RV32-NEXT: li a1, 256
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a1
+; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: li a1, 1280
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v16, 0
+; RV32-NEXT: vmerge.vim v16, v16, -1, v0
+; RV32-NEXT: vid.v v0
+; RV32-NEXT: vrsub.vx v0, v0, a1
+; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v8, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmaxu.vv v16, v16, v24
+; RV32-NEXT: li a1, 768
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a1
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: li a1, 1792
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v24, v24, -1, v0
+; RV32-NEXT: vid.v v0
+; RV32-NEXT: vrsub.vx v0, v0, a1
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vmaxu.vv v8, v24, v8
+; RV32-NEXT: li a1, 512
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v24, v24, -1, v0
+; RV32-NEXT: vid.v v0
+; RV32-NEXT: vrsub.vx v0, v0, a1
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vmaxu.vv v8, v8, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: li a1, 1536
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v5, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv1r.v v0, v5
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a1
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vmaxu.vv v8, v8, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: li a1, 1024
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v6, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv1r.v v0, v6
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a1
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: slli a1, a1, 11
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a3, a2, 5
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v7, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: vmerge.vim v16, v24, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 192
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v16, v16, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 1216
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v16, v16, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v8, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 704
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 1728
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v16, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 448
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v5, 8
+; RV32-NEXT: li a2, 1472
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v16, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v6, 8
+; RV32-NEXT: li a2, 960
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vrsub.vx v16, v24, a2
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v7, 8
+; RV32-NEXT: li a2, 1984
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vmaxu.vv v16, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 320
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v16, v16, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 1344
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v16, v16, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 832
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a2
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 1856
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vmaxu.vv v16, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 576
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 1600
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vrsub.vx v16, v16, a2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v16, v8, v16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v8, 8
+; RV32-NEXT: li a2, 1088
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vmerge.vim v8, v24, -1, v0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: mv a4, a3
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a4, a4, a3
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl1r.v v24, (a3) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v24, 8
+; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV32-NEXT: vid.v v24
+; RV32-NEXT: vrsub.vx v24, v24, a2
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vim v24, v24, -1, v0
+; RV32-NEXT: vid.v v0
+; RV32-NEXT: vrsub.vx v0, v0, a0
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vmaxu.vv v8, v8, v24
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a2, a2, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a2, a2, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a2, a2, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a2, a2, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a2, a2, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a2, a2, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmaxu.vv v8, v16, v8
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add sp, sp, a1
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: ctz_v2048i1:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xef, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 111 * vlenb
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 5
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv1r.v v0, v22
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v16, v8, -1, v0
+; RV64-NEXT: vid.v v8
+; RV64-NEXT: vrsub.vx v24, v8, a1
+; RV64-NEXT: vand.vv v16, v16, v24
+; RV64-NEXT: li a1, 1152
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v24, v24, -1, v0
+; RV64-NEXT: vrsub.vx v0, v8, a1
+; RV64-NEXT: vand.vv v24, v24, v0
+; RV64-NEXT: vmaxu.vv v16, v24, v16
+; RV64-NEXT: li a1, 640
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v24, v24, -1, v0
+; RV64-NEXT: vrsub.vx v0, v8, a1
+; RV64-NEXT: vand.vv v24, v24, v0
+; RV64-NEXT: li a1, 1664
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v0
+; RV64-NEXT: vrsub.vx v0, v0, a1
+; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vmaxu.vv v8, v8, v24
+; RV64-NEXT: li a1, 384
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v24, v24, -1, v0
+; RV64-NEXT: vid.v v0
+; RV64-NEXT: vrsub.vx v0, v0, a1
+; RV64-NEXT: vand.vv v24, v24, v0
+; RV64-NEXT: vmaxu.vv v8, v8, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: li a1, 1408
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a1
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vmaxu.vv v8, v8, v24
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: li a1, 896
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a1
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: li a1, 1920
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmerge.vim v16, v24, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a1
+; RV64-NEXT: vand.vv v16, v16, v24
+; RV64-NEXT: vmaxu.vv v16, v16, v8
+; RV64-NEXT: li a1, 256
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a1
+; RV64-NEXT: vand.vv v24, v8, v24
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: li a1, 1280
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v16, 0
+; RV64-NEXT: vmerge.vim v16, v16, -1, v0
+; RV64-NEXT: vid.v v0
+; RV64-NEXT: vrsub.vx v0, v0, a1
+; RV64-NEXT: vand.vv v16, v16, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v8, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vmaxu.vv v16, v16, v24
+; RV64-NEXT: li a1, 768
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a1
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: li a1, 1792
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v24, v24, -1, v0
+; RV64-NEXT: vid.v v0
+; RV64-NEXT: vrsub.vx v0, v0, a1
+; RV64-NEXT: vand.vv v24, v24, v0
+; RV64-NEXT: vmaxu.vv v8, v24, v8
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v24, v24, -1, v0
+; RV64-NEXT: vid.v v0
+; RV64-NEXT: vrsub.vx v0, v0, a1
+; RV64-NEXT: vand.vv v24, v24, v0
+; RV64-NEXT: vmaxu.vv v8, v8, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: li a1, 1536
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 5
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v5, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv1r.v v0, v5
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a1
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vmaxu.vv v8, v8, v24
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: li a1, 1024
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v6, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv1r.v v0, v6
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a1
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: slli a1, a1, 11
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a3, a2, 5
+; RV64-NEXT: add a2, a3, a2
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v7, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: vmerge.vim v16, v24, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a1
+; RV64-NEXT: vand.vv v16, v16, v24
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 192
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a2
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v16, v16, v8
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 1216
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v16, v16, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v8, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 704
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 1728
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a2
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v16, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 448
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v5, 8
+; RV64-NEXT: li a2, 1472
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v16, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v6, 8
+; RV64-NEXT: li a2, 960
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vrsub.vx v16, v24, a2
+; RV64-NEXT: vand.vv v16, v8, v16
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v7, 8
+; RV64-NEXT: li a2, 1984
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vmaxu.vv v16, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 320
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v16, v16, v8
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 1344
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v16, v16, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 832
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a2
+; RV64-NEXT: vand.vv v16, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 1856
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vmaxu.vv v16, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 576
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 1600
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vrsub.vx v16, v16, a2
+; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v16, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: mv a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a3, a3, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v8, 8
+; RV64-NEXT: li a2, 1088
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vmerge.vim v8, v24, -1, v0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: mv a4, a3
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: add a4, a4, a3
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl1r.v v24, (a3) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v24, 8
+; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; RV64-NEXT: vid.v v24
+; RV64-NEXT: vrsub.vx v24, v24, a2
+; RV64-NEXT: vand.vv v8, v8, v24
+; RV64-NEXT: vmv.v.i v24, 0
+; RV64-NEXT: vmerge.vim v24, v24, -1, v0
+; RV64-NEXT: vid.v v0
+; RV64-NEXT: vrsub.vx v0, v0, a0
+; RV64-NEXT: vand.vv v24, v24, v0
+; RV64-NEXT: vmaxu.vv v8, v8, v24
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a2, a2, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a2, a2, a0
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a2, a2, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a2, a2, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a2, a2, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: mv a2, a0
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add a2, a2, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmaxu.vv v8, v16, v8
+; RV64-NEXT: vredmaxu.vs v8, v8, v8
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: sub a0, a1, a0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a2, a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a2, a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a2, a2, a1
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: add a2, a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: add sp, sp, a1
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %res = call i16 @llvm.experimental.cttz.elts(<2048 x i1> %a, i1 0)
+ ret i16 %res
+}
+
; ZERO IS POISON
define i32 @ctz_v2i1_poison(<2 x i1> %a) {
>From e7a2022718906c9641f7c4cab0b24f86e976868a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 10 Mar 2026 15:51:48 +0800
Subject: [PATCH 2/8] [SDAG] Expand CTTZ_ELTS[_ZERO_POISON] in SelectionDAG
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 10 +-
llvm/include/llvm/CodeGen/ISDOpcodes.h | 2 +-
llvm/include/llvm/CodeGen/TargetLowering.h | 8 +-
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../SelectionDAG/LegalizeVectorOps.cpp | 6 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 24 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 47 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 63 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 4 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 19 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 -
.../AArch64/AArch64TargetTransformInfo.cpp | 7 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 -
llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 -
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 10 +-
.../Analysis/CostModel/AArch64/cttz_elts.ll | 34 +-
.../Analysis/CostModel/RISCV/cttz_elts.ll | 16 +-
.../AArch64/intrinsic-cttz-elts-sve.ll | 131 +-
.../CodeGen/AArch64/intrinsic-cttz-elts.ll | 6 +-
.../CodeGen/AArch64/sve-mask-partition.ll | 256 +-
llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll | 115 +-
.../RISCV/rvv/fixed-vectors-cttz-elts.ll | 2594 +++--------------
22 files changed, 656 insertions(+), 2706 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 7812a301efbd7..803c6fbf93978 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2115,15 +2115,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
case Intrinsic::experimental_cttz_elts: {
+ EVT RetType = getTLI()->getValueType(DL, ICA.getReturnType(), true);
EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
// If we're not expanding the intrinsic then we assume this is cheap
// to implement.
- if (!getTLI()->shouldExpandCttzElements(ArgType))
- return getTypeLegalizationCost(RetTy).first;
+ auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ if (getTLI()->isOperationLegalOrCustom(ISD::CTTZ_ELTS, LT.second))
+ return LT.first;
// TODO: The costs below reflect the expansion code in
- // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
+ // TargetLowering, but we may want to sacrifice some accuracy in
// favour of compile time.
// Find the smallest "sensible" element type to use for the expansion.
@@ -2133,7 +2135,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
VScaleRange = getVScaleRange(I->getCaller(), 64);
unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
- RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
+ RetType, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
// Create the new vector type & get the vector length
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index eac6faceafd0c..efe1e7e41da19 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1580,7 +1580,7 @@ enum NodeType {
EXPERIMENTAL_VECTOR_HISTOGRAM,
/// Returns the number of number of trailing (least significant) zero elements
- /// in a vector. Has a single i1 vector operand. The result is poison if the
+ /// in a vector. Has a single vector operand. The result is poison if the
/// return type isn't wide enough to hold the maximum number of elements in
/// the input vector.
CTTZ_ELTS,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 51c00b2591ecf..832df3ea26f59 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -492,13 +492,9 @@ class LLVM_ABI TargetLoweringBase {
return true;
}
- /// Return true if the @llvm.experimental.cttz.elts intrinsic should be
- /// expanded using generic code in SelectionDAGBuilder.
- virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
-
/// Return the minimum number of bits required to hold the maximum possible
/// number of trailing zero vector elements.
- unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC,
+ unsigned getBitWidthForCttzElements(EVT RetVT, ElementCount EC,
bool ZeroIsPoison,
const ConstantRange *VScaleRange) const;
@@ -5820,6 +5816,8 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// temporarily, advance store position, before re-loading the final vector.
SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const;
+ SDValue expandCttzElts(SDNode *Node, SelectionDAG &DAG) const;
+
/// Expands PARTIAL_REDUCE_S/UMLA nodes to a series of simpler operations,
/// consisting of zext/sext, extract_subvector, mul and add operations.
SDValue expandPartialReduceMLA(SDNode *Node, SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 14f361f8bcaed..ff5280d219c5e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -987,6 +987,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue SplitVecOp_FPOpDifferentTypes(SDNode *N);
SDValue SplitVecOp_CMP(SDNode *N);
SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N);
+ SDValue SplitVecOp_CttzElts(SDNode *N);
SDValue SplitVecOp_VP_CttzElements(SDNode *N);
SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N);
SDValue SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 2409a1f31e26e..03396ba016153 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -517,6 +517,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMINIMUM:
case ISD::VECREDUCE_FMUL:
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON:
case ISD::VECTOR_FIND_LAST_ACTIVE:
Action = TLI.getOperationAction(Node->getOpcode(),
Node->getOperand(0).getValueType());
@@ -1354,6 +1356,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VECTOR_COMPRESS:
Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
return;
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON:
+ Results.push_back(TLI.expandCttzElts(Node, DAG));
+ return;
case ISD::VECTOR_FIND_LAST_ACTIVE:
Results.push_back(TLI.expandVectorFindLastActive(Node, DAG));
return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 564bf3b7f152e..7af934635b51b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3742,6 +3742,10 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VP_REDUCE_FMINIMUM:
Res = SplitVecOp_VP_REDUCE(N, OpNo);
break;
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON:
+ Res = SplitVecOp_CttzElts(N);
+ break;
case ISD::VP_CTTZ_ELTS:
case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
Res = SplitVecOp_VP_CttzElements(N);
@@ -4828,6 +4832,26 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_TO_XINT_SAT(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
+SDValue DAGTypeLegalizer::SplitVecOp_CttzElts(SDNode *N) {
+ SDLoc DL(N);
+ EVT ResVT = N->getValueType(0);
+
+ SDValue Lo, Hi;
+ SDValue VecOp = N->getOperand(0);
+ GetSplitVector(VecOp, Lo, Hi);
+
+ // if CTTZ_ELTS(Lo) != VL => CTTZ_ELTS(Lo).
+ // else => VL + (CTTZ_ELTS(Hi) or CTTZ_ELTS_ZERO_UNDEF(Hi)).
+ SDValue ResLo = DAG.getNode(ISD::CTTZ_ELTS, DL, ResVT, Lo);
+ SDValue VL =
+ DAG.getElementCount(DL, ResVT, Lo.getValueType().getVectorElementCount());
+ SDValue ResLoNotVL =
+ DAG.getSetCC(DL, getSetCCResultType(ResVT), ResLo, VL, ISD::SETNE);
+ SDValue ResHi = DAG.getNode(N->getOpcode(), DL, ResVT, Hi);
+ return DAG.getSelect(DL, ResVT, ResLoNotVL, ResLo,
+ DAG.getNode(ISD::ADD, DL, ResVT, VL, ResHi));
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_VP_CttzElements(SDNode *N) {
SDLoc DL(N);
EVT ResVT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 04b17b56b3d49..5ca4f3bc3a9eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8326,53 +8326,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::experimental_cttz_elts: {
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
- EVT OpVT = Op.getValueType();
EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
bool ZeroIsPoison =
!cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero();
-
- if (!TLI.shouldExpandCttzElements(OpVT)) {
- SDValue Ret = DAG.getNode(ZeroIsPoison ? ISD::CTTZ_ELTS_ZERO_POISON
- : ISD::CTTZ_ELTS,
- sdl, RetTy, Op);
- setValue(&I, Ret);
- return;
- }
-
- if (OpVT.getScalarType() != MVT::i1) {
- // Compare the input vector elements to zero & use to count trailing zeros
- SDValue AllZero = DAG.getConstant(0, DL, OpVT);
- OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- OpVT.getVectorElementCount());
- Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
- }
-
- // If the zero-is-poison flag is set, we can assume the upper limit
- // of the result is VF-1.
- ConstantRange VScaleRange(1, true); // Dummy value.
- if (isa<ScalableVectorType>(I.getOperand(0)->getType()))
- VScaleRange = getVScaleRange(I.getCaller(), 64);
- unsigned EltWidth = TLI.getBitWidthForCttzElements(
- I.getType(), OpVT.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
-
- MVT NewEltTy = MVT::getIntegerVT(EltWidth);
-
- // Create the new vector type & get the vector length
- EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy,
- OpVT.getVectorElementCount());
-
- SDValue VL =
- DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount());
-
- SDValue StepVec = DAG.getStepVector(DL, NewVT);
- SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
- SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
- SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op);
- SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
- SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
- SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
-
- SDValue Ret = DAG.getZExtOrTrunc(Sub, DL, RetTy);
+ SDValue Ret =
+ DAG.getNode(ZeroIsPoison ? ISD::CTTZ_ELTS_ZERO_POISON : ISD::CTTZ_ELTS,
+ sdl, RetTy, Op);
setValue(&I, Ret);
return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f3873c1996f3d..24fbe5029e80e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10021,9 +10021,9 @@ SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
if (MaskVT.isScalableVector())
VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- uint64_t EltWidth = TLI.getBitWidthForCttzElements(
- BoolVT.getTypeForEVT(*DAG.getContext()), MaskVT.getVectorElementCount(),
- /*ZeroIsPoison=*/true, &VScaleRange);
+ uint64_t EltWidth =
+ TLI.getBitWidthForCttzElements(BoolVT, MaskVT.getVectorElementCount(),
+ /*ZeroIsPoison=*/true, &VScaleRange);
// If the step vector element type is smaller than the mask element type,
// use the mask type directly to avoid widening issues.
EltWidth = std::max(EltWidth, BoolVT.getFixedSizeInBits());
@@ -12491,6 +12491,63 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node,
return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
}
+SDValue TargetLowering::expandCttzElts(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue Op = Node->getOperand(0);
+ EVT OpVT = Op.getValueType();
+
+ if (OpVT.getVectorElementType() != MVT::i1) {
+ // Compare the input vector elements to zero & use to count trailing zeros.
+ SDValue AllZero = DAG.getConstant(0, DL, OpVT);
+ EVT I1OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ OpVT.getVectorElementCount());
+ // If cttz_elts is legal for the i1 type, use it instead of expanding.
+ if (isOperationLegalOrCustom(Node->getOpcode(), I1OpVT)) {
+ Op = DAG.getSetCC(DL, I1OpVT, Op, AllZero, ISD::SETNE);
+ return DAG.getNode(Node->getOpcode(), DL, VT, Op);
+ }
+
+ Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
+ }
+
+ // If the zero-is-poison flag is set, we can assume the upper limit
+ // of the result is VF-1.
+ bool ZeroIsPoison = Node->getOpcode() == ISD::CTTZ_ELTS_ZERO_POISON;
+ ConstantRange VScaleRange(1, true); // Dummy value.
+ if (OpVT.isScalableVector())
+ VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
+ unsigned EltWidth = getBitWidthForCttzElements(
+ VT, OpVT.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
+
+ EVT NewEltVT = MVT::getIntegerVT(EltWidth);
+
+ // Create the new vector type & get the vector length
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT,
+ OpVT.getVectorElementCount());
+
+ // Promote types now to avoid redundant zexts.
+ if (getTypeAction(NewVT.getSimpleVT()) == TypePromoteInteger) {
+ NewVT = getTypeToTransformTo(*DAG.getContext(), NewVT);
+ NewEltVT = NewVT.getVectorElementType();
+ }
+ if (getTypeAction(NewEltVT.getSimpleVT()) == TypePromoteInteger)
+ NewEltVT = getTypeToTransformTo(*DAG.getContext(), NewEltVT);
+
+ SDValue VL = DAG.getElementCount(DL, NewEltVT, NewVT.getVectorElementCount());
+
+ SDValue StepVec = DAG.getStepVector(DL, NewVT);
+ SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
+ SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
+ SDValue Ext = DAG.getSExtOrTrunc(Op, DL, NewVT);
+ SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
+ SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltVT, And);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltVT,
+ DAG.getZExtOrTrunc(VL, DL, NewEltVT), Max);
+
+ return DAG.getZExtOrTrunc(Sub, DL, VT);
+}
+
SDValue TargetLowering::expandPartialReduceMLA(SDNode *N,
SelectionDAG &DAG) const {
SDLoc DL(N);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b6d5a4c22e133..b4dc9afae725c 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1347,7 +1347,7 @@ bool TargetLoweringBase::isFreeAddrSpaceCast(unsigned SrcAS,
}
unsigned TargetLoweringBase::getBitWidthForCttzElements(
- Type *RetTy, ElementCount EC, bool ZeroIsPoison,
+ EVT RetVT, ElementCount EC, bool ZeroIsPoison,
const ConstantRange *VScaleRange) const {
// Find the smallest "sensible" element type to use for the expansion.
ConstantRange CR(APInt(64, EC.getKnownMinValue()));
@@ -1357,7 +1357,7 @@ unsigned TargetLoweringBase::getBitWidthForCttzElements(
if (ZeroIsPoison)
CR = CR.subtract(APInt(64, 1));
- unsigned EltWidth = RetTy->getScalarSizeInBits();
+ unsigned EltWidth = RetVT.getScalarSizeInBits();
EltWidth = std::min(EltWidth, CR.getActiveBits());
EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 38db1ac4a2fb9..babfb3051ab7b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2135,6 +2135,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ // Set to custom so we can expand cttz.elts during type legalization for NEON
+ for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
+ setOperationAction({ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON}, VT,
+ Custom);
+
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
// Only required for llvm.aarch64.mops.memset.tag
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
@@ -2338,17 +2343,6 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
return false;
}
-bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
- if (!Subtarget->isSVEorStreamingSVEAvailable())
- return true;
-
- // We can only use the BRKB + CNTP sequence with legal predicate types. We can
- // also support fixed-width predicates.
- return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
- VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
- VT != MVT::v4i1 && VT != MVT::v2i1;
-}
-
bool AArch64TargetLowering::shouldExpandVectorMatch(EVT VT,
unsigned SearchSize) const {
// MATCH is SVE2 and only available in non-streaming mode.
@@ -8492,6 +8486,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
EVT VT = CttzOp.getValueType();
assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
+ if (!Subtarget->isSVEorStreamingSVEAvailable())
+ return expandCttzElts(Op.getNode(), DAG);
+
if (VT.isFixedLengthVector()) {
// We can use SVE instructions to lower this intrinsic by first creating
// an SVE predicate register mask from the fixed-width vector.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 49ff76bb2f469..07b80b2c50a40 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -553,8 +553,6 @@ class AArch64TargetLowering : public TargetLowering {
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
- bool shouldExpandCttzElements(EVT VT) const override;
-
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override;
/// If a change in streaming mode is required on entry to/return from a
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 107ab9fdf3f9c..3d6eeedcc0ade 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1077,12 +1077,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
break;
}
case Intrinsic::experimental_cttz_elts: {
- EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
- if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
+ Type *ArgTy = ICA.getArgTypes()[0];
+
+ if (ST->isSVEorStreamingSVEAvailable()) {
// This will consist of a SVE brkb and a cntp instruction. These
// typically have the same latency and half the throughput as a vector
// add instruction.
- return 4;
+ return getTypeLegalizationCost(ArgTy).first * 4;
}
break;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7c1eacbce3701..0758764f5f543 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2014,11 +2014,6 @@ bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT,
return VF > MaxVF || !isPowerOf2_32(VF);
}
-bool RISCVTargetLowering::shouldExpandCttzElements(EVT VT) const {
- return !Subtarget.hasVInstructions() ||
- VT.getVectorElementType() != MVT::i1 || !isTypeLegal(VT);
-}
-
void RISCVTargetLowering::getTgtMemIntrinsic(
SmallVectorImpl<IntrinsicInfo> &Infos, const CallBase &I,
MachineFunction &MF, unsigned Intrinsic) const {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 8d88aeb7ae3fc..cd2609d8b604b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -120,8 +120,6 @@ class RISCVTargetLowering : public TargetLowering {
shouldExpandBuildVectorWithShuffles(EVT VT,
unsigned DefinedValues) const override;
- bool shouldExpandCttzElements(EVT VT) const override;
-
/// Return the cost of LMUL for linear operations.
InstructionCost getLMULCost(MVT VT) const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 8f09e25bbfc23..fee3173c54ada 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1656,11 +1656,11 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
case Intrinsic::experimental_cttz_elts: {
Type *ArgTy = ICA.getArgTypes()[0];
- EVT ArgType = TLI->getValueType(DL, ArgTy, true);
- if (getTLI()->shouldExpandCttzElements(ArgType))
+ auto LT = getTypeLegalizationCost(ArgTy);
+ if (!LT.second.isVector())
break;
- InstructionCost Cost = getRISCVInstructionCost(
- RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
+ InstructionCost Cost = LT.first * getRISCVInstructionCost(
+ RISCV::VFIRST_M, LT.second, CostKind);
// If zero_is_poison is false, then we will generate additional
// cmp + select instructions to convert -1 to EVL.
@@ -1733,7 +1733,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// Find a suitable type for a stepvector.
ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
- MaskTy->getScalarType(), MaskTy->getElementCount(),
+ MaskLT.second, MaskTy->getElementCount(),
/*ZeroIsPoison=*/true, &VScaleRange);
EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index 3bd929db1052a..2bcf4e6b1e153 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -3,47 +3,47 @@
define void @foo_no_vscale_range() {
; CHECK-LABEL: 'foo_no_vscale_range'
-; CHECK-NEXT: Cost Model: Found costs of Invalid for: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:96 CodeSize:66 Lat:66 SizeLat:66 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:96 CodeSize:66 Lat:66 SizeLat:66 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
@@ -101,22 +101,22 @@ define void @foo_vscale_range_1_16() vscale_range(1,16) {
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
@@ -150,22 +150,22 @@ define void @foo_vscale_range_1_16384() vscale_range(1,16384) {
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
diff --git a/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
index 094d73ddd0581..392206feb17c1 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
@@ -9,28 +9,28 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 661 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 334 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 661 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 334 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
@@ -75,28 +75,28 @@ define void @foo_vscale_range_2_16() vscale_range(2,16) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 33e7c69f041d4..38ea26a4fb287 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -5,40 +5,31 @@
; WITH VSCALE RANGE
define i32 @ctz_nxv32i1(<vscale x 32 x i1> %a) #0 {
-; CHECK-LABEL: ctz_nxv32i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: index z0.h, #0, #-1
-; CHECK-NEXT: cnth x8
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: punpklo p3.h, p1.b
-; CHECK-NEXT: rdvl x9, #2
-; CHECK-NEXT: mov z1.h, w8
-; CHECK-NEXT: rdvl x8, #-1
-; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: mov z2.h, w8
-; CHECK-NEXT: punpkhi p1.h, p1.b
-; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: inch z0.h, all, mul #4
-; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: add z1.h, z0.h, z1.h
-; CHECK-NEXT: add z4.h, z0.h, z2.h
-; CHECK-NEXT: and z0.d, z0.d, z3.d
-; CHECK-NEXT: add z2.h, z1.h, z2.h
-; CHECK-NEXT: and z3.d, z4.d, z5.d
-; CHECK-NEXT: and z1.d, z1.d, z6.d
-; CHECK-NEXT: and z2.d, z2.d, z7.d
-; CHECK-NEXT: umax z0.h, p0/m, z0.h, z3.h
-; CHECK-NEXT: umax z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: umaxv h0, p0, z0.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w8, w9, w8
-; CHECK-NEXT: and w0, w8, #0xffff
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_nxv32i1:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: ptrue p2.b
+; NONSTREAMING-NEXT: rdvl x8, #1
+; NONSTREAMING-NEXT: mov w10, w8
+; NONSTREAMING-NEXT: brkb p0.b, p2/z, p0.b
+; NONSTREAMING-NEXT: brkb p1.b, p2/z, p1.b
+; NONSTREAMING-NEXT: cntp x9, p0, p0.b
+; NONSTREAMING-NEXT: incp x8, p1.b
+; NONSTREAMING-NEXT: cmp w9, w10
+; NONSTREAMING-NEXT: csel w0, w9, w8, ne
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_nxv32i1:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: ptrue p2.b
+; STREAMING-NEXT: rdvl x10, #1
+; STREAMING-NEXT: brkb p1.b, p2/z, p1.b
+; STREAMING-NEXT: brkb p0.b, p2/z, p0.b
+; STREAMING-NEXT: cntp x8, p1, p1.b
+; STREAMING-NEXT: cntp x9, p0, p0.b
+; STREAMING-NEXT: incb x8
+; STREAMING-NEXT: cmp w9, w10
+; STREAMING-NEXT: csel w0, w9, w8, ne
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> %a, i1 0)
ret i32 %res
}
@@ -47,17 +38,9 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: ctz_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: cntw x9
; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT: index z0.s, #0, #-1
-; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: incw z0.s
-; CHECK-NEXT: and z0.d, z0.d, z1.d
-; CHECK-NEXT: and z0.s, z0.s, #0xff
-; CHECK-NEXT: umaxv s0, p0, z0.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w8, w9, w8
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: cntp x0, p0, p0.s
; CHECK-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i1 0)
ret i32 %res
@@ -69,40 +52,9 @@ define i64 @vscale_4096(<vscale x 16 x i8> %a) #1 {
; CHECK-LABEL: vscale_4096:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: index z1.s, #0, #-1
-; CHECK-NEXT: cntw x8
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: mov z0.s, w8
-; CHECK-NEXT: cnth x8
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: incw z1.s, all, mul #4
-; CHECK-NEXT: mov z2.s, w8
-; CHECK-NEXT: punpklo p1.h, p0.b
-; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: add z4.s, z1.s, z2.s
-; CHECK-NEXT: punpkhi p2.h, p1.b
-; CHECK-NEXT: punpkhi p3.h, p0.b
-; CHECK-NEXT: punpklo p0.h, p0.b
-; CHECK-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: add z2.s, z0.s, z2.s
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: mov z5.s, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: and z0.d, z0.d, z3.d
-; CHECK-NEXT: and z2.d, z2.d, z5.d
-; CHECK-NEXT: and z3.d, z4.d, z6.d
-; CHECK-NEXT: and z1.d, z1.d, z7.d
-; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: umaxv s0, p0, z0.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: cntp x0, p0, p0.b
; CHECK-NEXT: ret
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8(<vscale x 16 x i8> %a, i1 0)
ret i64 %res
@@ -112,26 +64,9 @@ define i64 @vscale_4096_poison(<vscale x 16 x i8> %a) #1 {
; CHECK-LABEL: vscale_4096_poison:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: index z1.h, #0, #-1
-; CHECK-NEXT: cnth x8
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: mov z0.h, w8
-; CHECK-NEXT: inch z1.h, all, mul #2
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: punpklo p0.h, p0.b
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: and z0.d, z0.d, z2.d
-; CHECK-NEXT: and z1.d, z1.d, z3.d
-; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: umaxv h0, p0, z0.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w8, w9, w8
-; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: cntp x0, p0, p0.b
; CHECK-NEXT: ret
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8(<vscale x 16 x i8> %a, i1 1)
ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
index a7ffefdecb5f7..1bf000c07edbc 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
@@ -56,8 +56,7 @@ define i32 @ctz_v16i1(<16 x i1> %a) {
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: umaxv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w8, w9, w8
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: sub w0, w9, w8
; CHECK-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
ret i32 %res
@@ -79,8 +78,7 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: umaxv h0, v0.4h
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w8, w9, w8
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: sub w0, w9, w8
; CHECK-NEXT: ret
%res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0)
ret i16 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
index e4bad94f08b45..9aa673ee6cce8 100644
--- a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
+++ b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
@@ -223,163 +223,17 @@ define <2 x i1> @mask_include_active_v2(<2 x i1> %mask.in) {
define <vscale x 32 x i1> @mask_exclude_active_nxv32(<vscale x 32 x i1> %mask.in) {
; CHECK-LABEL: mask_exclude_active_nxv32:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-9
-; CHECK-NEXT: str p11, [sp] // 2-byte Spill
-; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Spill
-; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Spill
-; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Spill
-; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Spill
-; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Spill
-; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Spill
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Spill
-; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc8, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
-; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
-; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x40, 0x1c // $d10 @ cfa - 24 * VG - 16
-; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x40, 0x1c // $d11 @ cfa - 32 * VG - 16
-; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x40, 0x1c // $d12 @ cfa - 40 * VG - 16
-; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x40, 0x1c // $d13 @ cfa - 48 * VG - 16
-; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
-; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
-; CHECK-NEXT: index z2.d, #0, #-1
-; CHECK-NEXT: cnth x8
-; CHECK-NEXT: punpkhi p5.h, p0.b
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: punpkhi p4.h, p1.b
-; CHECK-NEXT: cntw x9
-; CHECK-NEXT: mov z0.d, x8
-; CHECK-NEXT: punpklo p3.h, p5.b
-; CHECK-NEXT: rdvl x8, #-1
-; CHECK-NEXT: punpklo p2.h, p4.b
-; CHECK-NEXT: mov z1.d, x8
-; CHECK-NEXT: neg x8, x9
-; CHECK-NEXT: incd z2.d, all, mul #16
-; CHECK-NEXT: punpklo p10.h, p0.b
-; CHECK-NEXT: mov z5.d, x8
-; CHECK-NEXT: punpklo p9.h, p3.b
-; CHECK-NEXT: cntd x8
-; CHECK-NEXT: rdvl x9, #2
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: add z4.d, z2.d, z0.d
-; CHECK-NEXT: punpklo p8.h, p2.b
-; CHECK-NEXT: mov z7.d, p9/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: punpklo p6.h, p10.b
-; CHECK-NEXT: mov z28.d, x8
-; CHECK-NEXT: add z25.d, z2.d, z5.d
-; CHECK-NEXT: punpklo p7.h, p1.b
-; CHECK-NEXT: mov z3.d, p8/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: add z6.d, z4.d, z1.d
-; CHECK-NEXT: punpklo p8.h, p6.b
-; CHECK-NEXT: and z4.d, z4.d, z7.d
-; CHECK-NEXT: punpkhi p0.h, p1.b
-; CHECK-NEXT: add z28.d, z2.d, z28.d
-; CHECK-NEXT: add z26.d, z25.d, z0.d
-; CHECK-NEXT: punpkhi p1.h, p10.b
-; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
-; CHECK-NEXT: mov z7.d, p8/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: punpklo p11.h, p7.b
-; CHECK-NEXT: and z3.d, z6.d, z3.d
-; CHECK-NEXT: add z6.d, z2.d, z1.d
-; CHECK-NEXT: punpklo p9.h, p0.b
-; CHECK-NEXT: add z29.d, z25.d, z1.d
-; CHECK-NEXT: add z5.d, z28.d, z5.d
-; CHECK-NEXT: punpklo p8.h, p1.b
-; CHECK-NEXT: mov z24.d, p11/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ldr p11, [sp] // 2-byte Reload
-; CHECK-NEXT: punpkhi p5.h, p5.b
-; CHECK-NEXT: mov z27.d, p9/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: add z31.d, z26.d, z1.d
-; CHECK-NEXT: punpkhi p4.h, p4.b
-; CHECK-NEXT: mov z30.d, p8/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: and z2.d, z2.d, z7.d
-; CHECK-NEXT: punpklo p9.h, p5.b
-; CHECK-NEXT: and z6.d, z6.d, z24.d
-; CHECK-NEXT: add z12.d, z5.d, z1.d
-; CHECK-NEXT: punpklo p8.h, p4.b
-; CHECK-NEXT: and z7.d, z29.d, z27.d
-; CHECK-NEXT: add z29.d, z28.d, z0.d
-; CHECK-NEXT: mov z24.d, p9/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Reload
-; CHECK-NEXT: punpkhi p3.h, p3.b
-; CHECK-NEXT: mov z8.d, p8/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT: punpkhi p2.h, p2.b
-; CHECK-NEXT: add z0.d, z5.d, z0.d
-; CHECK-NEXT: punpkhi p7.h, p7.b
-; CHECK-NEXT: and z25.d, z25.d, z30.d
-; CHECK-NEXT: punpkhi p6.h, p6.b
-; CHECK-NEXT: and z24.d, z26.d, z24.d
-; CHECK-NEXT: mov z10.d, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: and z26.d, z31.d, z8.d
-; CHECK-NEXT: punpkhi p1.h, p1.b
-; CHECK-NEXT: mov z8.d, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: add z27.d, z28.d, z1.d
-; CHECK-NEXT: mov z30.d, p7/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT: punpkhi p3.h, p5.b
-; CHECK-NEXT: mov z31.d, p6/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT: punpkhi p2.h, p4.b
-; CHECK-NEXT: add z9.d, z29.d, z1.d
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT: mov z11.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z13.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT: mov z14.d, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: add z1.d, z0.d, z1.d
-; CHECK-NEXT: mov z15.d, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: and z27.d, z27.d, z30.d
-; CHECK-NEXT: and z28.d, z28.d, z31.d
-; CHECK-NEXT: and z29.d, z29.d, z8.d
-; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: and z30.d, z9.d, z10.d
-; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: and z5.d, z5.d, z11.d
-; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: and z31.d, z12.d, z13.d
-; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: and z0.d, z0.d, z14.d
-; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: and z1.d, z1.d, z15.d
-; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: umax z3.d, p0/m, z3.d, z4.d
-; CHECK-NEXT: umax z2.d, p0/m, z2.d, z6.d
-; CHECK-NEXT: umax z7.d, p0/m, z7.d, z25.d
-; CHECK-NEXT: umax z24.d, p0/m, z24.d, z26.d
-; CHECK-NEXT: umax z27.d, p0/m, z27.d, z28.d
-; CHECK-NEXT: umax z29.d, p0/m, z29.d, z30.d
-; CHECK-NEXT: umax z5.d, p0/m, z5.d, z31.d
-; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: umax z2.d, p0/m, z2.d, z3.d
-; CHECK-NEXT: umax z7.d, p0/m, z7.d, z24.d
-; CHECK-NEXT: umax z27.d, p0/m, z27.d, z29.d
-; CHECK-NEXT: umax z0.d, p0/m, z0.d, z5.d
-; CHECK-NEXT: umax z2.d, p0/m, z2.d, z7.d
-; CHECK-NEXT: umax z0.d, p0/m, z0.d, z27.d
-; CHECK-NEXT: umax z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT: umaxv d0, p0, z0.d
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: sub x8, x9, x8
-; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: whilelo p0.b, xzr, x8
-; CHECK-NEXT: whilelo p1.b, x9, x8
-; CHECK-NEXT: addvl sp, sp, #9
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p2.b
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, x8
+; CHECK-NEXT: brkb p0.b, p2/z, p0.b
+; CHECK-NEXT: brkb p1.b, p2/z, p1.b
+; CHECK-NEXT: cntp x10, p0, p0.b
+; CHECK-NEXT: incp x9, p1.b
+; CHECK-NEXT: cmp x10, x8
+; CHECK-NEXT: csel x9, x10, x9, ne
+; CHECK-NEXT: whilelo p0.b, xzr, x9
+; CHECK-NEXT: whilelo p1.b, x8, x9
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> %mask.in, i1 false)
%mask.out = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 %tz.elts)
@@ -392,16 +246,13 @@ define <32 x i1> @mask_exclude_active_v32(<32 x i1> %mask.in) {
; CHECK-NEXT: ldr w9, [sp, #64]
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldr w10, [sp, #72]
-; CHECK-NEXT: index z2.b, #0, #-1
+; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: ldr w9, [sp, #80]
; CHECK-NEXT: mov v0.b[1], w1
; CHECK-NEXT: mov v1.b[1], w10
; CHECK-NEXT: ldr w10, [sp, #128]
-; CHECK-NEXT: mov z3.d, z2.d
-; CHECK-NEXT: add z2.b, z2.b, #32 // =0x20
; CHECK-NEXT: mov v0.b[2], w2
-; CHECK-NEXT: add z3.b, z3.b, #16 // =0x10
; CHECK-NEXT: mov v1.b[2], w9
; CHECK-NEXT: ldr w9, [sp, #88]
; CHECK-NEXT: mov v0.b[3], w3
@@ -448,72 +299,71 @@ define <32 x i1> @mask_exclude_active_v32(<32 x i1> %mask.in) {
; CHECK-NEXT: mov v1.b[14], w10
; CHECK-NEXT: ldr w10, [sp, #184]
; CHECK-NEXT: mov v0.b[15], w9
-; CHECK-NEXT: mov w9, #32 // =0x20
; CHECK-NEXT: mov v1.b[15], w10
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: shl v1.16b, v1.16b, #7
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: cmpne p2.b, p0/z, z1.b, #0
+; CHECK-NEXT: brkb p1.b, p0/z, p1.b
+; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z6.d, z0.d
; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: umax v2.16b, v2.16b, v1.16b
-; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: brkb p0.b, p0/z, p2.b
; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: cntp x9, p1, p1.b
+; CHECK-NEXT: mov z16.d, z0.d
; CHECK-NEXT: mov z17.d, z0.d
+; CHECK-NEXT: cntp x10, p0, p0.b
; CHECK-NEXT: mov z18.d, z0.d
; CHECK-NEXT: mov z19.d, z0.d
; CHECK-NEXT: mov z20.d, z0.d
; CHECK-NEXT: mov z21.d, z0.d
-; CHECK-NEXT: umaxv b16, v2.16b
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z22.d, z0.d
-; CHECK-NEXT: mov z23.d, z0.d
; CHECK-NEXT: add z1.d, z1.d, #14 // =0xe
; CHECK-NEXT: add z3.d, z3.d, #12 // =0xc
; CHECK-NEXT: add z6.d, z6.d, #10 // =0xa
+; CHECK-NEXT: cmp x9, #16
; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
; CHECK-NEXT: add z2.d, z2.d, #6 // =0x6
+; CHECK-NEXT: add x10, x10, #16
+; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2
-; CHECK-NEXT: add z17.d, z17.d, #30 // =0x1e
-; CHECK-NEXT: fmov w10, s16
-; CHECK-NEXT: add z18.d, z18.d, #28 // =0x1c
-; CHECK-NEXT: add z19.d, z19.d, #26 // =0x1a
-; CHECK-NEXT: add z20.d, z20.d, #24 // =0x18
-; CHECK-NEXT: add z21.d, z21.d, #22 // =0x16
-; CHECK-NEXT: add z22.d, z22.d, #20 // =0x14
-; CHECK-NEXT: add z23.d, z23.d, #18 // =0x12
-; CHECK-NEXT: sub w9, w9, w10
-; CHECK-NEXT: and x9, x9, #0xff
-; CHECK-NEXT: dup v16.2d, x9
+; CHECK-NEXT: csel x9, x9, x10, ne
+; CHECK-NEXT: add z16.d, z16.d, #30 // =0x1e
+; CHECK-NEXT: add z17.d, z17.d, #28 // =0x1c
+; CHECK-NEXT: dup v23.2d, x9
+; CHECK-NEXT: add z18.d, z18.d, #26 // =0x1a
+; CHECK-NEXT: add z19.d, z19.d, #24 // =0x18
+; CHECK-NEXT: add z20.d, z20.d, #22 // =0x16
+; CHECK-NEXT: add z21.d, z21.d, #20 // =0x14
+; CHECK-NEXT: add z22.d, z22.d, #18 // =0x12
; CHECK-NEXT: adrp x9, .LCPI17_0
-; CHECK-NEXT: cmhi v24.2d, v16.2d, v0.2d
+; CHECK-NEXT: cmhi v24.2d, v23.2d, v0.2d
; CHECK-NEXT: add z0.d, z0.d, #16 // =0x10
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d
-; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v17.2d, v16.2d, v17.2d
-; CHECK-NEXT: cmhi v18.2d, v16.2d, v18.2d
-; CHECK-NEXT: cmhi v19.2d, v16.2d, v19.2d
-; CHECK-NEXT: cmhi v20.2d, v16.2d, v20.2d
-; CHECK-NEXT: cmhi v21.2d, v16.2d, v21.2d
-; CHECK-NEXT: cmhi v22.2d, v16.2d, v22.2d
-; CHECK-NEXT: cmhi v23.2d, v16.2d, v23.2d
-; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v7.2d, v16.2d, v7.2d
+; CHECK-NEXT: cmhi v1.2d, v23.2d, v1.2d
+; CHECK-NEXT: cmhi v3.2d, v23.2d, v3.2d
+; CHECK-NEXT: cmhi v6.2d, v23.2d, v6.2d
+; CHECK-NEXT: cmhi v4.2d, v23.2d, v4.2d
+; CHECK-NEXT: cmhi v16.2d, v23.2d, v16.2d
+; CHECK-NEXT: cmhi v17.2d, v23.2d, v17.2d
+; CHECK-NEXT: cmhi v18.2d, v23.2d, v18.2d
+; CHECK-NEXT: cmhi v19.2d, v23.2d, v19.2d
+; CHECK-NEXT: cmhi v20.2d, v23.2d, v20.2d
+; CHECK-NEXT: cmhi v21.2d, v23.2d, v21.2d
+; CHECK-NEXT: cmhi v22.2d, v23.2d, v22.2d
+; CHECK-NEXT: cmhi v0.2d, v23.2d, v0.2d
+; CHECK-NEXT: cmhi v2.2d, v23.2d, v2.2d
+; CHECK-NEXT: cmhi v5.2d, v23.2d, v5.2d
+; CHECK-NEXT: cmhi v7.2d, v23.2d, v7.2d
; CHECK-NEXT: uzp1 v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: uzp1 v3.4s, v18.4s, v17.4s
-; CHECK-NEXT: uzp1 v16.4s, v20.4s, v19.4s
-; CHECK-NEXT: uzp1 v17.4s, v22.4s, v21.4s
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v23.4s
+; CHECK-NEXT: uzp1 v3.4s, v17.4s, v16.4s
+; CHECK-NEXT: uzp1 v16.4s, v19.4s, v18.4s
+; CHECK-NEXT: uzp1 v17.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v22.4s
; CHECK-NEXT: uzp1 v4.4s, v4.4s, v6.4s
; CHECK-NEXT: uzp1 v2.4s, v5.4s, v2.4s
; CHECK-NEXT: uzp1 v5.4s, v24.4s, v7.4s
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll
index cdaed030e274c..fc892c1a5cae0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll
@@ -7,44 +7,26 @@
define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV32-LABEL: ctz_nxv4i32:
; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT: vmsne.vi v10, v8, 0
+; RV32-NEXT: vfirst.m a0, v10
+; RV32-NEXT: bgez a0, .LBB0_2
+; RV32-NEXT: # %bb.1:
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; RV32-NEXT: vid.v v10
-; RV32-NEXT: li a1, -1
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: srli a0, a0, 1
-; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vmadd.vx v10, a1, v8
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vvm v8, v8, v10, v0
-; RV32-NEXT: vredmaxu.vs v8, v8, v8
-; RV32-NEXT: vmv.x.s a1, v8
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 16
+; RV32-NEXT: .LBB0_2:
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_nxv4i32:
; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT: vmsne.vi v10, v8, 0
+; RV64-NEXT: vfirst.m a0, v10
+; RV64-NEXT: bgez a0, .LBB0_2
+; RV64-NEXT: # %bb.1:
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: li a1, -1
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: srli a0, a0, 1
-; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmadd.vx v10, a1, v8
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vvm v8, v8, v10, v0
-; RV64-NEXT: vredmaxu.vs v8, v8, v8
-; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 48
+; RV64-NEXT: .LBB0_2:
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i1 0)
ret i32 %res
@@ -55,74 +37,25 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
; RV32-LABEL: ctz_nxv8i1_no_range:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb
-; RV32-NEXT: addi a0, sp, 32
-; RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; RV32-NEXT: vmsne.vi v10, v8, 0
+; RV32-NEXT: vfirst.m a0, v10
+; RV32-NEXT: bgez a0, .LBB1_2
+; RV32-NEXT: # %bb.1:
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: srli a0, a0, 3
-; RV32-NEXT: li a2, 8
+; RV32-NEXT: .LBB1_2:
; RV32-NEXT: li a1, 0
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __muldi3
-; RV32-NEXT: sw a0, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
-; RV32-NEXT: vid.v v8
-; RV32-NEXT: li a2, -1
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vl2r.v v24, (a3) # vscale x 16-byte Folded Reload
-; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT: vmsne.vi v0, v24, 0
-; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vmadd.vx v8, a2, v16
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vmerge.vim v16, v16, -1, v0
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vredmaxu.vs v8, v8, v8
-; RV32-NEXT: vmv.x.s a3, v8
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v8, v8, a2
-; RV32-NEXT: sltu a2, a0, a3
-; RV32-NEXT: vmv.x.s a4, v8
-; RV32-NEXT: sub a1, a1, a4
-; RV32-NEXT: sub a1, a1, a2
-; RV32-NEXT: sub a0, a0, a3
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add sp, sp, a2
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_nxv8i1_no_range:
; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; RV64-NEXT: vmsne.vi v10, v8, 0
+; RV64-NEXT: vfirst.m a0, v10
+; RV64-NEXT: bgez a0, .LBB1_2
+; RV64-NEXT: # %bb.1:
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: li a1, -1
-; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT: vmsne.vi v0, v8, 0
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmadd.vx v16, a1, v8
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV64-NEXT: vredmaxu.vs v8, v8, v8
-; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: .LBB1_2:
; RV64-NEXT: ret
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i16(<vscale x 8 x i16> %a, i1 0)
ret i64 %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll
index 1d1b1a02746ec..cb91e5dfe73c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll
@@ -8,35 +8,23 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
; RV32-LABEL: ctz_v4i32:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmsne.vi v0, v8, 0
-; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v9
-; RV32-NEXT: vrsub.vi v9, v9, 4
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vredmaxu.vs v8, v8, v8
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: li a1, 4
-; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: zext.b a0, a1
+; RV32-NEXT: vmsne.vi v8, v8, 0
+; RV32-NEXT: vfirst.m a0, v8
+; RV32-NEXT: bgez a0, .LBB0_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a0, 4
+; RV32-NEXT: .LBB0_2:
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v4i32:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmsne.vi v0, v8, 0
-; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v9
-; RV64-NEXT: vrsub.vi v9, v9, 4
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: vredmaxu.vs v8, v8, v8
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: li a1, 4
-; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: zext.b a0, a1
+; RV64-NEXT: vmsne.vi v8, v8, 0
+; RV64-NEXT: vfirst.m a0, v8
+; RV64-NEXT: bgez a0, .LBB0_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a0, 4
+; RV64-NEXT: .LBB0_2:
; RV64-NEXT: ret
%res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0)
ret i16 %res
@@ -45,2188 +33,398 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
define i16 @ctz_v2048i1(<2048 x i1> %a) {
; RV32-LABEL: ctz_v2048i1:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xef, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 111 * vlenb
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 5
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv1r.v v0, v22
-; RV32-NEXT: li a0, 64
; RV32-NEXT: li a1, 128
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v16, v8, -1, v0
-; RV32-NEXT: vid.v v8
-; RV32-NEXT: vrsub.vx v24, v8, a1
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: li a1, 1152
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v24, v24, -1, v0
-; RV32-NEXT: vrsub.vx v0, v8, a1
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vmaxu.vv v16, v24, v16
-; RV32-NEXT: li a1, 640
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v24, v24, -1, v0
-; RV32-NEXT: vrsub.vx v0, v8, a1
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: li a1, 1664
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v0
-; RV32-NEXT: vrsub.vx v0, v0, a1
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vmaxu.vv v8, v8, v24
-; RV32-NEXT: li a1, 384
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v24, v24, -1, v0
-; RV32-NEXT: vid.v v0
-; RV32-NEXT: vrsub.vx v0, v0, a1
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vmaxu.vv v8, v8, v16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: mv a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: li a1, 1408
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a1
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vmaxu.vv v8, v8, v24
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: li a1, 896
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a1
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: li a1, 1920
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmerge.vim v16, v24, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a1
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vmaxu.vv v16, v16, v8
-; RV32-NEXT: li a1, 256
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a1
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: li a1, 1280
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: vmerge.vim v16, v16, -1, v0
-; RV32-NEXT: vid.v v0
-; RV32-NEXT: vrsub.vx v0, v0, a1
-; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: mv a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v8, v0
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: mv a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vmaxu.vv v16, v16, v24
-; RV32-NEXT: li a1, 768
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a1
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: li a1, 1792
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v24, v24, -1, v0
-; RV32-NEXT: vid.v v0
-; RV32-NEXT: vrsub.vx v0, v0, a1
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vmaxu.vv v8, v24, v8
-; RV32-NEXT: li a1, 512
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v24, v24, -1, v0
-; RV32-NEXT: vid.v v0
-; RV32-NEXT: vrsub.vx v0, v0, a1
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vmaxu.vv v8, v8, v16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: li a1, 1536
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v5, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv1r.v v0, v5
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a1
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vmaxu.vv v8, v8, v24
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vfirst.m a0, v0
+; RV32-NEXT: bltz a0, .LBB1_32
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: vfirst.m a2, v8
+; RV32-NEXT: bltz a2, .LBB1_33
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: beq a0, a1, .LBB1_34
+; RV32-NEXT: .LBB1_3:
+; RV32-NEXT: vfirst.m a3, v9
+; RV32-NEXT: bltz a3, .LBB1_35
+; RV32-NEXT: .LBB1_4:
+; RV32-NEXT: vfirst.m a2, v10
+; RV32-NEXT: bltz a2, .LBB1_36
+; RV32-NEXT: .LBB1_5:
+; RV32-NEXT: beq a3, a1, .LBB1_37
+; RV32-NEXT: .LBB1_6:
+; RV32-NEXT: li a2, 256
+; RV32-NEXT: beq a0, a2, .LBB1_38
+; RV32-NEXT: .LBB1_7:
+; RV32-NEXT: vfirst.m a4, v11
+; RV32-NEXT: bltz a4, .LBB1_39
+; RV32-NEXT: .LBB1_8:
+; RV32-NEXT: vfirst.m a3, v12
+; RV32-NEXT: bltz a3, .LBB1_40
+; RV32-NEXT: .LBB1_9:
+; RV32-NEXT: beq a4, a1, .LBB1_41
+; RV32-NEXT: .LBB1_10:
+; RV32-NEXT: vfirst.m a3, v13
+; RV32-NEXT: bltz a3, .LBB1_42
+; RV32-NEXT: .LBB1_11:
+; RV32-NEXT: vfirst.m a5, v14
+; RV32-NEXT: bltz a5, .LBB1_43
+; RV32-NEXT: .LBB1_12:
+; RV32-NEXT: beq a3, a1, .LBB1_44
+; RV32-NEXT: .LBB1_13:
+; RV32-NEXT: beq a4, a2, .LBB1_45
+; RV32-NEXT: .LBB1_14:
+; RV32-NEXT: li a3, 512
+; RV32-NEXT: beq a0, a3, .LBB1_46
+; RV32-NEXT: .LBB1_15:
+; RV32-NEXT: vfirst.m a4, v15
+; RV32-NEXT: bltz a4, .LBB1_47
+; RV32-NEXT: .LBB1_16:
+; RV32-NEXT: vfirst.m a5, v16
+; RV32-NEXT: bltz a5, .LBB1_48
+; RV32-NEXT: .LBB1_17:
+; RV32-NEXT: beq a4, a1, .LBB1_49
+; RV32-NEXT: .LBB1_18:
+; RV32-NEXT: vfirst.m a5, v17
+; RV32-NEXT: bltz a5, .LBB1_50
+; RV32-NEXT: .LBB1_19:
+; RV32-NEXT: vfirst.m a6, v18
+; RV32-NEXT: bltz a6, .LBB1_51
+; RV32-NEXT: .LBB1_20:
+; RV32-NEXT: beq a5, a1, .LBB1_52
+; RV32-NEXT: .LBB1_21:
+; RV32-NEXT: beq a4, a2, .LBB1_53
+; RV32-NEXT: .LBB1_22:
+; RV32-NEXT: vfirst.m a5, v19
+; RV32-NEXT: bltz a5, .LBB1_54
+; RV32-NEXT: .LBB1_23:
+; RV32-NEXT: vfirst.m a6, v20
+; RV32-NEXT: bltz a6, .LBB1_55
+; RV32-NEXT: .LBB1_24:
+; RV32-NEXT: beq a5, a1, .LBB1_56
+; RV32-NEXT: .LBB1_25:
+; RV32-NEXT: vfirst.m a6, v21
+; RV32-NEXT: bltz a6, .LBB1_57
+; RV32-NEXT: .LBB1_26:
+; RV32-NEXT: vfirst.m a7, v22
+; RV32-NEXT: bltz a7, .LBB1_58
+; RV32-NEXT: .LBB1_27:
+; RV32-NEXT: beq a6, a1, .LBB1_59
+; RV32-NEXT: .LBB1_28:
+; RV32-NEXT: beq a5, a2, .LBB1_60
+; RV32-NEXT: .LBB1_29:
+; RV32-NEXT: beq a4, a3, .LBB1_61
+; RV32-NEXT: .LBB1_30:
+; RV32-NEXT: li a1, 1024
+; RV32-NEXT: beq a0, a1, .LBB1_62
+; RV32-NEXT: .LBB1_31:
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB1_32:
+; RV32-NEXT: li a0, 128
+; RV32-NEXT: vfirst.m a2, v8
+; RV32-NEXT: bgez a2, .LBB1_2
+; RV32-NEXT: .LBB1_33:
+; RV32-NEXT: li a2, 128
+; RV32-NEXT: bne a0, a1, .LBB1_3
+; RV32-NEXT: .LBB1_34:
+; RV32-NEXT: addi a0, a2, 128
+; RV32-NEXT: vfirst.m a3, v9
+; RV32-NEXT: bgez a3, .LBB1_4
+; RV32-NEXT: .LBB1_35:
+; RV32-NEXT: li a3, 128
+; RV32-NEXT: vfirst.m a2, v10
+; RV32-NEXT: bgez a2, .LBB1_5
+; RV32-NEXT: .LBB1_36:
+; RV32-NEXT: li a2, 128
+; RV32-NEXT: bne a3, a1, .LBB1_6
+; RV32-NEXT: .LBB1_37:
+; RV32-NEXT: addi a3, a2, 128
+; RV32-NEXT: li a2, 256
+; RV32-NEXT: bne a0, a2, .LBB1_7
+; RV32-NEXT: .LBB1_38:
+; RV32-NEXT: addi a0, a3, 256
+; RV32-NEXT: vfirst.m a4, v11
+; RV32-NEXT: bgez a4, .LBB1_8
+; RV32-NEXT: .LBB1_39:
+; RV32-NEXT: li a4, 128
+; RV32-NEXT: vfirst.m a3, v12
+; RV32-NEXT: bgez a3, .LBB1_9
+; RV32-NEXT: .LBB1_40:
+; RV32-NEXT: li a3, 128
+; RV32-NEXT: bne a4, a1, .LBB1_10
+; RV32-NEXT: .LBB1_41:
+; RV32-NEXT: addi a4, a3, 128
+; RV32-NEXT: vfirst.m a3, v13
+; RV32-NEXT: bgez a3, .LBB1_11
+; RV32-NEXT: .LBB1_42:
+; RV32-NEXT: li a3, 128
+; RV32-NEXT: vfirst.m a5, v14
+; RV32-NEXT: bgez a5, .LBB1_12
+; RV32-NEXT: .LBB1_43:
+; RV32-NEXT: li a5, 128
+; RV32-NEXT: bne a3, a1, .LBB1_13
+; RV32-NEXT: .LBB1_44:
+; RV32-NEXT: addi a3, a5, 128
+; RV32-NEXT: bne a4, a2, .LBB1_14
+; RV32-NEXT: .LBB1_45:
+; RV32-NEXT: addi a4, a3, 256
+; RV32-NEXT: li a3, 512
+; RV32-NEXT: bne a0, a3, .LBB1_15
+; RV32-NEXT: .LBB1_46:
+; RV32-NEXT: addi a0, a4, 512
+; RV32-NEXT: vfirst.m a4, v15
+; RV32-NEXT: bgez a4, .LBB1_16
+; RV32-NEXT: .LBB1_47:
+; RV32-NEXT: li a4, 128
+; RV32-NEXT: vfirst.m a5, v16
+; RV32-NEXT: bgez a5, .LBB1_17
+; RV32-NEXT: .LBB1_48:
+; RV32-NEXT: li a5, 128
+; RV32-NEXT: bne a4, a1, .LBB1_18
+; RV32-NEXT: .LBB1_49:
+; RV32-NEXT: addi a4, a5, 128
+; RV32-NEXT: vfirst.m a5, v17
+; RV32-NEXT: bgez a5, .LBB1_19
+; RV32-NEXT: .LBB1_50:
+; RV32-NEXT: li a5, 128
+; RV32-NEXT: vfirst.m a6, v18
+; RV32-NEXT: bgez a6, .LBB1_20
+; RV32-NEXT: .LBB1_51:
+; RV32-NEXT: li a6, 128
+; RV32-NEXT: bne a5, a1, .LBB1_21
+; RV32-NEXT: .LBB1_52:
+; RV32-NEXT: addi a5, a6, 128
+; RV32-NEXT: bne a4, a2, .LBB1_22
+; RV32-NEXT: .LBB1_53:
+; RV32-NEXT: addi a4, a5, 256
+; RV32-NEXT: vfirst.m a5, v19
+; RV32-NEXT: bgez a5, .LBB1_23
+; RV32-NEXT: .LBB1_54:
+; RV32-NEXT: li a5, 128
+; RV32-NEXT: vfirst.m a6, v20
+; RV32-NEXT: bgez a6, .LBB1_24
+; RV32-NEXT: .LBB1_55:
+; RV32-NEXT: li a6, 128
+; RV32-NEXT: bne a5, a1, .LBB1_25
+; RV32-NEXT: .LBB1_56:
+; RV32-NEXT: addi a5, a6, 128
+; RV32-NEXT: vfirst.m a6, v21
+; RV32-NEXT: bgez a6, .LBB1_26
+; RV32-NEXT: .LBB1_57:
+; RV32-NEXT: li a6, 128
+; RV32-NEXT: vfirst.m a7, v22
+; RV32-NEXT: bgez a7, .LBB1_27
+; RV32-NEXT: .LBB1_58:
+; RV32-NEXT: li a7, 128
+; RV32-NEXT: bne a6, a1, .LBB1_28
+; RV32-NEXT: .LBB1_59:
+; RV32-NEXT: addi a6, a7, 128
+; RV32-NEXT: bne a5, a2, .LBB1_29
+; RV32-NEXT: .LBB1_60:
+; RV32-NEXT: addi a5, a6, 256
+; RV32-NEXT: bne a4, a3, .LBB1_30
+; RV32-NEXT: .LBB1_61:
+; RV32-NEXT: addi a4, a5, 512
; RV32-NEXT: li a1, 1024
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v6, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv1r.v v0, v6
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a1
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: slli a1, a1, 11
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a3, a2, 5
-; RV32-NEXT: add a2, a3, a2
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v7, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vmv1r.v v0, v7
-; RV32-NEXT: vmerge.vim v16, v24, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a1
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 192
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v16, v16, v8
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 1216
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v16, v16, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v8, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 704
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 1728
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v16, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 448
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v5, 8
-; RV32-NEXT: li a2, 1472
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v16, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v6, 8
-; RV32-NEXT: li a2, 960
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vrsub.vx v16, v24, a2
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v7, 8
-; RV32-NEXT: li a2, 1984
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vmaxu.vv v16, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 320
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v16, v16, v8
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 1344
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v16, v16, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 832
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a2
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 1856
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vmaxu.vv v16, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 576
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 1600
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: vid.v v16
-; RV32-NEXT: vrsub.vx v16, v16, a2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v16, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: mv a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a3, a3, a2
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v8, 8
-; RV32-NEXT: li a2, 1088
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vmerge.vim v8, v24, -1, v0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: mv a4, a3
-; RV32-NEXT: slli a3, a3, 1
-; RV32-NEXT: add a4, a4, a3
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl1r.v v24, (a3) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v24, 8
-; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV32-NEXT: vid.v v24
-; RV32-NEXT: vrsub.vx v24, v24, a2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vmv.v.i v24, 0
-; RV32-NEXT: vmerge.vim v24, v24, -1, v0
-; RV32-NEXT: vid.v v0
-; RV32-NEXT: vrsub.vx v0, v0, a0
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vmaxu.vv v8, v8, v24
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a2, a2, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a2, a2, a0
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a2, a2, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a2, a2, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a2, a2, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add a2, a2, a0
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmaxu.vv v8, v16, v8
-; RV32-NEXT: vredmaxu.vs v8, v8, v8
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: mv a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a2, a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a2, a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a2, a2, a1
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: add a2, a2, a1
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add sp, sp, a1
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: bne a0, a1, .LBB1_31
+; RV32-NEXT: .LBB1_62:
+; RV32-NEXT: addi a0, a4, 1024
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v2048i1:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xef, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 111 * vlenb
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 4
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 4
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 4
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 5
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a1, a1, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 5
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv1r.v v0, v22
-; RV64-NEXT: li a0, 64
; RV64-NEXT: li a1, 128
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v16, v8, -1, v0
-; RV64-NEXT: vid.v v8
-; RV64-NEXT: vrsub.vx v24, v8, a1
-; RV64-NEXT: vand.vv v16, v16, v24
-; RV64-NEXT: li a1, 1152
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v24, v24, -1, v0
-; RV64-NEXT: vrsub.vx v0, v8, a1
-; RV64-NEXT: vand.vv v24, v24, v0
-; RV64-NEXT: vmaxu.vv v16, v24, v16
-; RV64-NEXT: li a1, 640
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v24, v24, -1, v0
-; RV64-NEXT: vrsub.vx v0, v8, a1
-; RV64-NEXT: vand.vv v24, v24, v0
-; RV64-NEXT: li a1, 1664
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v0
-; RV64-NEXT: vrsub.vx v0, v0, a1
-; RV64-NEXT: vand.vv v8, v8, v0
-; RV64-NEXT: vmaxu.vv v8, v8, v24
-; RV64-NEXT: li a1, 384
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v24, v24, -1, v0
-; RV64-NEXT: vid.v v0
-; RV64-NEXT: vrsub.vx v0, v0, a1
-; RV64-NEXT: vand.vv v24, v24, v0
-; RV64-NEXT: vmaxu.vv v8, v8, v16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: li a1, 1408
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a1
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: vmaxu.vv v8, v8, v24
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: li a1, 896
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a1
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: li a1, 1920
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmerge.vim v16, v24, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a1
-; RV64-NEXT: vand.vv v16, v16, v24
-; RV64-NEXT: vmaxu.vv v16, v16, v8
-; RV64-NEXT: li a1, 256
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a1
-; RV64-NEXT: vand.vv v24, v8, v24
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: li a1, 1280
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v16, 0
-; RV64-NEXT: vmerge.vim v16, v16, -1, v0
-; RV64-NEXT: vid.v v0
-; RV64-NEXT: vrsub.vx v0, v0, a1
-; RV64-NEXT: vand.vv v16, v16, v0
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v8, v0
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vmaxu.vv v16, v16, v24
-; RV64-NEXT: li a1, 768
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a1
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: li a1, 1792
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v24, v24, -1, v0
-; RV64-NEXT: vid.v v0
-; RV64-NEXT: vrsub.vx v0, v0, a1
-; RV64-NEXT: vand.vv v24, v24, v0
-; RV64-NEXT: vmaxu.vv v8, v24, v8
-; RV64-NEXT: li a1, 512
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v24, v24, -1, v0
-; RV64-NEXT: vid.v v0
-; RV64-NEXT: vrsub.vx v0, v0, a1
-; RV64-NEXT: vand.vv v24, v24, v0
-; RV64-NEXT: vmaxu.vv v8, v8, v16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: li a1, 1536
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 5
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v5, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv1r.v v0, v5
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a1
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: vmaxu.vv v8, v8, v24
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vfirst.m a0, v0
+; RV64-NEXT: bltz a0, .LBB1_32
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: vfirst.m a2, v8
+; RV64-NEXT: bltz a2, .LBB1_33
+; RV64-NEXT: .LBB1_2:
+; RV64-NEXT: beq a0, a1, .LBB1_34
+; RV64-NEXT: .LBB1_3:
+; RV64-NEXT: vfirst.m a3, v9
+; RV64-NEXT: bltz a3, .LBB1_35
+; RV64-NEXT: .LBB1_4:
+; RV64-NEXT: vfirst.m a2, v10
+; RV64-NEXT: bltz a2, .LBB1_36
+; RV64-NEXT: .LBB1_5:
+; RV64-NEXT: beq a3, a1, .LBB1_37
+; RV64-NEXT: .LBB1_6:
+; RV64-NEXT: li a2, 256
+; RV64-NEXT: beq a0, a2, .LBB1_38
+; RV64-NEXT: .LBB1_7:
+; RV64-NEXT: vfirst.m a4, v11
+; RV64-NEXT: bltz a4, .LBB1_39
+; RV64-NEXT: .LBB1_8:
+; RV64-NEXT: vfirst.m a3, v12
+; RV64-NEXT: bltz a3, .LBB1_40
+; RV64-NEXT: .LBB1_9:
+; RV64-NEXT: beq a4, a1, .LBB1_41
+; RV64-NEXT: .LBB1_10:
+; RV64-NEXT: vfirst.m a3, v13
+; RV64-NEXT: bltz a3, .LBB1_42
+; RV64-NEXT: .LBB1_11:
+; RV64-NEXT: vfirst.m a5, v14
+; RV64-NEXT: bltz a5, .LBB1_43
+; RV64-NEXT: .LBB1_12:
+; RV64-NEXT: beq a3, a1, .LBB1_44
+; RV64-NEXT: .LBB1_13:
+; RV64-NEXT: beq a4, a2, .LBB1_45
+; RV64-NEXT: .LBB1_14:
+; RV64-NEXT: li a3, 512
+; RV64-NEXT: beq a0, a3, .LBB1_46
+; RV64-NEXT: .LBB1_15:
+; RV64-NEXT: vfirst.m a4, v15
+; RV64-NEXT: bltz a4, .LBB1_47
+; RV64-NEXT: .LBB1_16:
+; RV64-NEXT: vfirst.m a5, v16
+; RV64-NEXT: bltz a5, .LBB1_48
+; RV64-NEXT: .LBB1_17:
+; RV64-NEXT: beq a4, a1, .LBB1_49
+; RV64-NEXT: .LBB1_18:
+; RV64-NEXT: vfirst.m a5, v17
+; RV64-NEXT: bltz a5, .LBB1_50
+; RV64-NEXT: .LBB1_19:
+; RV64-NEXT: vfirst.m a6, v18
+; RV64-NEXT: bltz a6, .LBB1_51
+; RV64-NEXT: .LBB1_20:
+; RV64-NEXT: beq a5, a1, .LBB1_52
+; RV64-NEXT: .LBB1_21:
+; RV64-NEXT: beq a4, a2, .LBB1_53
+; RV64-NEXT: .LBB1_22:
+; RV64-NEXT: vfirst.m a5, v19
+; RV64-NEXT: bltz a5, .LBB1_54
+; RV64-NEXT: .LBB1_23:
+; RV64-NEXT: vfirst.m a6, v20
+; RV64-NEXT: bltz a6, .LBB1_55
+; RV64-NEXT: .LBB1_24:
+; RV64-NEXT: beq a5, a1, .LBB1_56
+; RV64-NEXT: .LBB1_25:
+; RV64-NEXT: vfirst.m a6, v21
+; RV64-NEXT: bltz a6, .LBB1_57
+; RV64-NEXT: .LBB1_26:
+; RV64-NEXT: vfirst.m a7, v22
+; RV64-NEXT: bltz a7, .LBB1_58
+; RV64-NEXT: .LBB1_27:
+; RV64-NEXT: beq a6, a1, .LBB1_59
+; RV64-NEXT: .LBB1_28:
+; RV64-NEXT: beq a5, a2, .LBB1_60
+; RV64-NEXT: .LBB1_29:
+; RV64-NEXT: beq a4, a3, .LBB1_61
+; RV64-NEXT: .LBB1_30:
+; RV64-NEXT: li a1, 1024
+; RV64-NEXT: beq a0, a1, .LBB1_62
+; RV64-NEXT: .LBB1_31:
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB1_32:
+; RV64-NEXT: li a0, 128
+; RV64-NEXT: vfirst.m a2, v8
+; RV64-NEXT: bgez a2, .LBB1_2
+; RV64-NEXT: .LBB1_33:
+; RV64-NEXT: li a2, 128
+; RV64-NEXT: bne a0, a1, .LBB1_3
+; RV64-NEXT: .LBB1_34:
+; RV64-NEXT: addi a0, a2, 128
+; RV64-NEXT: vfirst.m a3, v9
+; RV64-NEXT: bgez a3, .LBB1_4
+; RV64-NEXT: .LBB1_35:
+; RV64-NEXT: li a3, 128
+; RV64-NEXT: vfirst.m a2, v10
+; RV64-NEXT: bgez a2, .LBB1_5
+; RV64-NEXT: .LBB1_36:
+; RV64-NEXT: li a2, 128
+; RV64-NEXT: bne a3, a1, .LBB1_6
+; RV64-NEXT: .LBB1_37:
+; RV64-NEXT: addi a3, a2, 128
+; RV64-NEXT: li a2, 256
+; RV64-NEXT: bne a0, a2, .LBB1_7
+; RV64-NEXT: .LBB1_38:
+; RV64-NEXT: addi a0, a3, 256
+; RV64-NEXT: vfirst.m a4, v11
+; RV64-NEXT: bgez a4, .LBB1_8
+; RV64-NEXT: .LBB1_39:
+; RV64-NEXT: li a4, 128
+; RV64-NEXT: vfirst.m a3, v12
+; RV64-NEXT: bgez a3, .LBB1_9
+; RV64-NEXT: .LBB1_40:
+; RV64-NEXT: li a3, 128
+; RV64-NEXT: bne a4, a1, .LBB1_10
+; RV64-NEXT: .LBB1_41:
+; RV64-NEXT: addi a4, a3, 128
+; RV64-NEXT: vfirst.m a3, v13
+; RV64-NEXT: bgez a3, .LBB1_11
+; RV64-NEXT: .LBB1_42:
+; RV64-NEXT: li a3, 128
+; RV64-NEXT: vfirst.m a5, v14
+; RV64-NEXT: bgez a5, .LBB1_12
+; RV64-NEXT: .LBB1_43:
+; RV64-NEXT: li a5, 128
+; RV64-NEXT: bne a3, a1, .LBB1_13
+; RV64-NEXT: .LBB1_44:
+; RV64-NEXT: addi a3, a5, 128
+; RV64-NEXT: bne a4, a2, .LBB1_14
+; RV64-NEXT: .LBB1_45:
+; RV64-NEXT: addi a4, a3, 256
+; RV64-NEXT: li a3, 512
+; RV64-NEXT: bne a0, a3, .LBB1_15
+; RV64-NEXT: .LBB1_46:
+; RV64-NEXT: addi a0, a4, 512
+; RV64-NEXT: vfirst.m a4, v15
+; RV64-NEXT: bgez a4, .LBB1_16
+; RV64-NEXT: .LBB1_47:
+; RV64-NEXT: li a4, 128
+; RV64-NEXT: vfirst.m a5, v16
+; RV64-NEXT: bgez a5, .LBB1_17
+; RV64-NEXT: .LBB1_48:
+; RV64-NEXT: li a5, 128
+; RV64-NEXT: bne a4, a1, .LBB1_18
+; RV64-NEXT: .LBB1_49:
+; RV64-NEXT: addi a4, a5, 128
+; RV64-NEXT: vfirst.m a5, v17
+; RV64-NEXT: bgez a5, .LBB1_19
+; RV64-NEXT: .LBB1_50:
+; RV64-NEXT: li a5, 128
+; RV64-NEXT: vfirst.m a6, v18
+; RV64-NEXT: bgez a6, .LBB1_20
+; RV64-NEXT: .LBB1_51:
+; RV64-NEXT: li a6, 128
+; RV64-NEXT: bne a5, a1, .LBB1_21
+; RV64-NEXT: .LBB1_52:
+; RV64-NEXT: addi a5, a6, 128
+; RV64-NEXT: bne a4, a2, .LBB1_22
+; RV64-NEXT: .LBB1_53:
+; RV64-NEXT: addi a4, a5, 256
+; RV64-NEXT: vfirst.m a5, v19
+; RV64-NEXT: bgez a5, .LBB1_23
+; RV64-NEXT: .LBB1_54:
+; RV64-NEXT: li a5, 128
+; RV64-NEXT: vfirst.m a6, v20
+; RV64-NEXT: bgez a6, .LBB1_24
+; RV64-NEXT: .LBB1_55:
+; RV64-NEXT: li a6, 128
+; RV64-NEXT: bne a5, a1, .LBB1_25
+; RV64-NEXT: .LBB1_56:
+; RV64-NEXT: addi a5, a6, 128
+; RV64-NEXT: vfirst.m a6, v21
+; RV64-NEXT: bgez a6, .LBB1_26
+; RV64-NEXT: .LBB1_57:
+; RV64-NEXT: li a6, 128
+; RV64-NEXT: vfirst.m a7, v22
+; RV64-NEXT: bgez a7, .LBB1_27
+; RV64-NEXT: .LBB1_58:
+; RV64-NEXT: li a7, 128
+; RV64-NEXT: bne a6, a1, .LBB1_28
+; RV64-NEXT: .LBB1_59:
+; RV64-NEXT: addi a6, a7, 128
+; RV64-NEXT: bne a5, a2, .LBB1_29
+; RV64-NEXT: .LBB1_60:
+; RV64-NEXT: addi a5, a6, 256
+; RV64-NEXT: bne a4, a3, .LBB1_30
+; RV64-NEXT: .LBB1_61:
+; RV64-NEXT: addi a4, a5, 512
; RV64-NEXT: li a1, 1024
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v6, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv1r.v v0, v6
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a1
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: li a1, 1
-; RV64-NEXT: slli a1, a1, 11
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 5
-; RV64-NEXT: add a2, a3, a2
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v7, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vmerge.vim v16, v24, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a1
-; RV64-NEXT: vand.vv v16, v16, v24
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 192
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a2
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v16, v16, v8
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 1216
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v16, v16, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v8, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 704
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 1728
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a2
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v16, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 448
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v5, 8
-; RV64-NEXT: li a2, 1472
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v16, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v6, 8
-; RV64-NEXT: li a2, 960
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vrsub.vx v16, v24, a2
-; RV64-NEXT: vand.vv v16, v8, v16
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v7, 8
-; RV64-NEXT: li a2, 1984
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vmaxu.vv v16, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 320
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v16, v16, v8
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 1344
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v16, v16, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 832
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a2
-; RV64-NEXT: vand.vv v16, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 1856
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vmaxu.vv v16, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 576
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 1600
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vrsub.vx v16, v16, a2
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v16, v8, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: mv a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a3, a3, a2
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v8, 8
-; RV64-NEXT: li a2, 1088
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vmerge.vim v8, v24, -1, v0
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: mv a4, a3
-; RV64-NEXT: slli a3, a3, 1
-; RV64-NEXT: add a4, a4, a3
-; RV64-NEXT: slli a3, a3, 4
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: add a3, sp, a3
-; RV64-NEXT: addi a3, a3, 16
-; RV64-NEXT: vl1r.v v24, (a3) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v24, 8
-; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; RV64-NEXT: vid.v v24
-; RV64-NEXT: vrsub.vx v24, v24, a2
-; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vmv.v.i v24, 0
-; RV64-NEXT: vmerge.vim v24, v24, -1, v0
-; RV64-NEXT: vid.v v0
-; RV64-NEXT: vrsub.vx v0, v0, a0
-; RV64-NEXT: vand.vv v24, v24, v0
-; RV64-NEXT: vmaxu.vv v8, v8, v24
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a2, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a2, a2, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a2, a2, a0
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a2, a2, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: mv a2, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a2, a2, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a2, a2, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: mv a2, a0
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add a2, a2, a0
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmaxu.vv v8, v16, v8
-; RV64-NEXT: vredmaxu.vs v8, v8, v8
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: sub a0, a1, a0
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: mv a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: add sp, sp, a1
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: bne a0, a1, .LBB1_31
+; RV64-NEXT: .LBB1_62:
+; RV64-NEXT: addi a0, a4, 1024
; RV64-NEXT: ret
%res = call i16 @llvm.experimental.cttz.elts(<2048 x i1> %a, i1 0)
ret i16 %res
>From 6f2debcc5f67c411d13911dc608ab49f7f6d9bcf Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 24 Mar 2026 00:09:43 +0800
Subject: [PATCH 3/8] Address first round of review comments
---
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 2 +-
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 1 -
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 7 ++++---
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 7af934635b51b..807863b46606d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4841,7 +4841,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CttzElts(SDNode *N) {
GetSplitVector(VecOp, Lo, Hi);
// if CTTZ_ELTS(Lo) != VL => CTTZ_ELTS(Lo).
- // else => VL + (CTTZ_ELTS(Hi) or CTTZ_ELTS_ZERO_UNDEF(Hi)).
+ // else => VL + (CTTZ_ELTS(Hi) or CTTZ_ELTS_ZERO_POISON(Hi)).
SDValue ResLo = DAG.getNode(ISD::CTTZ_ELTS, DL, ResVT, Lo);
SDValue VL =
DAG.getElementCount(DL, ResVT, Lo.getValueType().getVectorElementCount());
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 5ca4f3bc3a9eb..2de519e451f47 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8324,7 +8324,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::experimental_cttz_elts: {
- auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
bool ZeroIsPoison =
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 2d5294c6a53c0..194f1da5f95a2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12596,9 +12596,10 @@ SDValue TargetLowering::expandCttzElts(SDNode *Node, SelectionDAG &DAG) const {
SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
SDValue Ext = DAG.getSExtOrTrunc(Op, DL, NewVT);
SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
- SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltVT, And);
- SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltVT,
- DAG.getZExtOrTrunc(VL, DL, NewEltVT), Max);
+ SDValue Max =
+ DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewVT.getVectorElementType(), And);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltVT, VL,
+ DAG.getZExtOrTrunc(Max, DL, NewEltVT));
return DAG.getZExtOrTrunc(Sub, DL, VT);
}
>From eda9a7c29dfc4a4339fb139758be4edd24c61cb4 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 24 Mar 2026 00:39:26 +0800
Subject: [PATCH 4/8] Address AArch64 cost model comments
---
.../AArch64/AArch64TargetTransformInfo.cpp | 16 +++++++--
.../Analysis/CostModel/AArch64/cttz_elts.ll | 34 +++++++++----------
2 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c7f66ddd8dbdc..659fc8be47783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1079,11 +1079,23 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
case Intrinsic::experimental_cttz_elts: {
Type *ArgTy = ICA.getArgTypes()[0];
- if (ST->isSVEorStreamingSVEAvailable()) {
+ auto LT = getTypeLegalizationCost(ArgTy);
+ // We always try to lower via an i1 vector first, so check if CTTZ_ELTS is
+ // legal or custom for it. The type may be illegal so we can't use
+ // isOperationLegalOrCustom.
+ LT.second = MVT::getVectorVT(MVT::i1, LT.second.getVectorElementCount());
+ TargetLowering::LegalizeAction OpAction =
+ TLI->getOperationAction(ISD::CTTZ_ELTS, LT.second);
+ if (OpAction == TargetLowering::Legal ||
+ OpAction == TargetLowering::Custom) {
// This will consist of a SVE brkb and a cntp instruction. These
// typically have the same latency and half the throughput as a vector
// add instruction.
- return getTypeLegalizationCost(ArgTy).first * 4;
+ InstructionCost Cost = LT.first * 4;
+ // Type splitting requires a cmp and csel.
+ if (LT.first > 1)
+ Cost += (LT.first - 1) * 2;
+ return Cost;
}
break;
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index 2bcf4e6b1e153..761c27d2cdc7f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -3,47 +3,47 @@
define void @foo_no_vscale_range() {
; CHECK-LABEL: 'foo_no_vscale_range'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of Invalid for: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
@@ -101,22 +101,22 @@ define void @foo_vscale_range_1_16() vscale_range(1,16) {
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
@@ -150,22 +150,22 @@ define void @foo_vscale_range_1_16384() vscale_range(1,16384) {
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 8 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
>From e82d00613896448f91fe3a9c7d311e9d6e3b83d0 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 24 Mar 2026 13:10:44 +0800
Subject: [PATCH 5/8] Promote cttz_elts* ops, don't custom lower them for NEON
This gives better lowering and avoids the need for the "custom lowering for an illegal type" path
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 9 +++++++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../Target/AArch64/AArch64ISelLowering.cpp | 8 --------
.../CodeGen/AArch64/intrinsic-cttz-elts.ll | 20 ++++++++-----------
4 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 4a27f804d6720..046f65657fe1f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2163,6 +2163,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::VECTOR_FIND_LAST_ACTIVE:
Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo);
break;
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON:
+ Res = PromoteIntOp_CTTZ_ELTS(N);
+ break;
case ISD::GET_ACTIVE_LANE_MASK:
Res = PromoteIntOp_GET_ACTIVE_LANE_MASK(N);
break;
@@ -2996,6 +3000,11 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N,
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_CTTZ_ELTS(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return SDValue(DAG.UpdateNodeOperands(N, Op), 0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N) {
SmallVector<SDValue, 1> NewOps(N->ops());
NewOps[0] = ZExtPromotedInteger(N->getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ff5280d219c5e..4a85c7ab39a32 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -419,6 +419,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_CTTZ_ELTS(SDNode *N);
SDValue PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N);
SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N);
SDValue PromoteIntOp_LOOP_DEPENDENCE_MASK(SDNode *N, unsigned OpNo);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index babfb3051ab7b..9f1cca5c9ef88 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2135,11 +2135,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- // Set to custom so we can expand cttz.elts during type legalization for NEON
- for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
- setOperationAction({ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON}, VT,
- Custom);
-
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
// Only required for llvm.aarch64.mops.memset.tag
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
@@ -8486,9 +8481,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
EVT VT = CttzOp.getValueType();
assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
- if (!Subtarget->isSVEorStreamingSVEAvailable())
- return expandCttzElts(Op.getNode(), DAG);
-
if (VT.isFixedLengthVector()) {
// We can use SVE instructions to lower this intrinsic by first creating
// an SVE predicate register mask from the fixed-width vector.
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
index 1bf000c07edbc..56720d62c019f 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
@@ -14,12 +14,11 @@ define i8 @ctz_v8i1(<8 x i1> %a) {
; CHECK-NEXT: .byte 1
; CHECK-LABEL: ctz_v8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
; CHECK-NEXT: mov w9, #8 // =0x8
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b
; CHECK-NEXT: umaxv b0, v0.8b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: sub w0, w9, w8
@@ -48,12 +47,11 @@ define i32 @ctz_v16i1(<16 x i1> %a) {
; CHECK-NEXT: .byte 1
; CHECK-LABEL: ctz_v16i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: cmeq v0.16b, v0.16b, #0
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
; CHECK-NEXT: umaxv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: sub w0, w9, w8
@@ -96,12 +94,11 @@ define i7 @ctz_i7_v8i1(<8 x i1> %a) {
; CHECK-NEXT: .byte 1
; CHECK-LABEL: ctz_i7_v8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
; CHECK-NEXT: mov w9, #8 // =0x8
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b
; CHECK-NEXT: umaxv b0, v0.8b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: sub w0, w9, w8
@@ -124,12 +121,11 @@ define i8 @ctz_v8i1_poison(<8 x i1> %a) {
; CHECK-NEXT: .byte 1
; CHECK-LABEL: ctz_v8i1_poison:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
; CHECK-NEXT: mov w9, #8 // =0x8
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b
; CHECK-NEXT: umaxv b0, v0.8b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: sub w0, w9, w8
>From 92b3052c9fb9f43108a0457c5bf3195082cda0e8 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 24 Mar 2026 14:30:00 +0800
Subject: [PATCH 6/8] Remove TTI changes for now
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 11 +++----
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +++
.../Target/AArch64/AArch64ISelLowering.cpp | 11 +++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 ++
.../AArch64/AArch64TargetTransformInfo.cpp | 19 ++---------
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 +++
llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 ++
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 10 +++---
.../Analysis/CostModel/AArch64/cttz_elts.ll | 32 +++++++++----------
.../Analysis/CostModel/RISCV/cttz_elts.ll | 16 +++++-----
10 files changed, 61 insertions(+), 51 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 803c6fbf93978..4cd97e726122d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2115,17 +2115,15 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
case Intrinsic::experimental_cttz_elts: {
- EVT RetType = getTLI()->getValueType(DL, ICA.getReturnType(), true);
EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
// If we're not expanding the intrinsic then we assume this is cheap
// to implement.
- auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
- if (getTLI()->isOperationLegalOrCustom(ISD::CTTZ_ELTS, LT.second))
- return LT.first;
+ if (!getTLI()->shouldExpandCttzElements(ArgType))
+ return getTypeLegalizationCost(RetTy).first;
// TODO: The costs below reflect the expansion code in
- // TargetLowering, but we may want to sacrifice some accuracy in
+ // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
// favour of compile time.
// Find the smallest "sensible" element type to use for the expansion.
@@ -2135,7 +2133,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
VScaleRange = getVScaleRange(I->getCaller(), 64);
unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
- RetType, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
+ getTLI()->getValueType(DL, RetTy), ArgType.getVectorElementCount(),
+ ZeroIsPoison, &VScaleRange);
Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
// Create the new vector type & get the vector length
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 832df3ea26f59..39ebee1a0500a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -492,6 +492,10 @@ class LLVM_ABI TargetLoweringBase {
return true;
}
+ /// Return true if the @llvm.experimental.cttz.elts intrinsic should be
+ /// expanded using generic code in SelectionDAGBuilder.
+ virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
+
/// Return the minimum number of bits required to hold the maximum possible
/// number of trailing zero vector elements.
unsigned getBitWidthForCttzElements(EVT RetVT, ElementCount EC,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f1cca5c9ef88..38db1ac4a2fb9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2338,6 +2338,17 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
return false;
}
+bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
+ if (!Subtarget->isSVEorStreamingSVEAvailable())
+ return true;
+
+ // We can only use the BRKB + CNTP sequence with legal predicate types. We can
+ // also support fixed-width predicates.
+ return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
+ VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
+ VT != MVT::v4i1 && VT != MVT::v2i1;
+}
+
bool AArch64TargetLowering::shouldExpandVectorMatch(EVT VT,
unsigned SearchSize) const {
// MATCH is SVE2 and only available in non-streaming mode.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 07b80b2c50a40..49ff76bb2f469 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -553,6 +553,8 @@ class AArch64TargetLowering : public TargetLowering {
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
+ bool shouldExpandCttzElements(EVT VT) const override;
+
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override;
/// If a change in streaming mode is required on entry to/return from a
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 659fc8be47783..ae7144155ad72 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1077,25 +1077,12 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
break;
}
case Intrinsic::experimental_cttz_elts: {
- Type *ArgTy = ICA.getArgTypes()[0];
-
- auto LT = getTypeLegalizationCost(ArgTy);
- // We always try to lower via an i1 vector first, so check if CTTZ_ELTS is
- // legal or custom for it. The type may be illegal so we can't use
- // isOperationLegalOrCustom.
- LT.second = MVT::getVectorVT(MVT::i1, LT.second.getVectorElementCount());
- TargetLowering::LegalizeAction OpAction =
- TLI->getOperationAction(ISD::CTTZ_ELTS, LT.second);
- if (OpAction == TargetLowering::Legal ||
- OpAction == TargetLowering::Custom) {
+ EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
// This will consist of a SVE brkb and a cntp instruction. These
// typically have the same latency and half the throughput as a vector
// add instruction.
- InstructionCost Cost = LT.first * 4;
- // Type splitting requires a cmp and csel.
- if (LT.first > 1)
- Cost += (LT.first - 1) * 2;
- return Cost;
+ return 4;
}
break;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0758764f5f543..7c1eacbce3701 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2014,6 +2014,11 @@ bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT,
return VF > MaxVF || !isPowerOf2_32(VF);
}
+bool RISCVTargetLowering::shouldExpandCttzElements(EVT VT) const {
+ return !Subtarget.hasVInstructions() ||
+ VT.getVectorElementType() != MVT::i1 || !isTypeLegal(VT);
+}
+
void RISCVTargetLowering::getTgtMemIntrinsic(
SmallVectorImpl<IntrinsicInfo> &Infos, const CallBase &I,
MachineFunction &MF, unsigned Intrinsic) const {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index cd2609d8b604b..8d88aeb7ae3fc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -120,6 +120,8 @@ class RISCVTargetLowering : public TargetLowering {
shouldExpandBuildVectorWithShuffles(EVT VT,
unsigned DefinedValues) const override;
+ bool shouldExpandCttzElements(EVT VT) const override;
+
/// Return the cost of LMUL for linear operations.
InstructionCost getLMULCost(MVT VT) const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fee3173c54ada..3dca57370e035 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1656,11 +1656,11 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
case Intrinsic::experimental_cttz_elts: {
Type *ArgTy = ICA.getArgTypes()[0];
- auto LT = getTypeLegalizationCost(ArgTy);
- if (!LT.second.isVector())
+ EVT ArgType = TLI->getValueType(DL, ArgTy, true);
+ if (getTLI()->shouldExpandCttzElements(ArgType))
break;
- InstructionCost Cost = LT.first * getRISCVInstructionCost(
- RISCV::VFIRST_M, LT.second, CostKind);
+ InstructionCost Cost = getRISCVInstructionCost(
+ RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
// If zero_is_poison is false, then we will generate additional
// cmp + select instructions to convert -1 to EVL.
@@ -1733,7 +1733,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// Find a suitable type for a stepvector.
ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
- MaskLT.second, MaskTy->getElementCount(),
+ MaskLT.second.getScalarType(), MaskTy->getElementCount(),
/*ZeroIsPoison=*/true, &VScaleRange);
EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index 761c27d2cdc7f..3bd929db1052a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -8,42 +8,42 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:96 CodeSize:66 Lat:66 SizeLat:66 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:96 CodeSize:66 Lat:66 SizeLat:66 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:10 SizeLat:10 for: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
@@ -101,22 +101,22 @@ define void @foo_vscale_range_1_16() vscale_range(1,16) {
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:18 SizeLat:18 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
@@ -150,22 +150,22 @@ define void @foo_vscale_range_1_16384() vscale_range(1,16384) {
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:34 Lat:34 SizeLat:34 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
diff --git a/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
index 392206feb17c1..094d73ddd0581 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
@@ -9,28 +9,28 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 661 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 334 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 661 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 334 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
@@ -75,28 +75,28 @@ define void @foo_vscale_range_2_16() vscale_range(2,16) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
>From fe67ffd67addc58a4e03350c61c85e94150d71df Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 24 Mar 2026 14:43:42 +0800
Subject: [PATCH 7/8] Add comment
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 39ebee1a0500a..8a306c7c219d3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5820,6 +5820,8 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// temporarily, advance store position, before re-loading the final vector.
SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const;
+ /// Expand a CTTZ_ELTS or CTTZ_ELTS_ZERO_POISON by calculating (VL - i) for
+ /// each active lane, getting the maximum and subtracting it from VL.
SDValue expandCttzElts(SDNode *Node, SelectionDAG &DAG) const;
/// Expands PARTIAL_REDUCE_S/UMLA nodes to a series of simpler operations,
>From b24b8d903a8c473325e74e3828276e645c45982f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 24 Mar 2026 16:08:55 +0800
Subject: [PATCH 8/8] Update comment
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8a306c7c219d3..ec972683735f3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5821,7 +5821,7 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const;
/// Expand a CTTZ_ELTS or CTTZ_ELTS_ZERO_POISON by calculating (VL - i) for
- /// each active lane, getting the maximum and subtracting it from VL.
+ /// each active lane (i), getting the maximum and subtracting it from VL.
SDValue expandCttzElts(SDNode *Node, SelectionDAG &DAG) const;
/// Expands PARTIAL_REDUCE_S/UMLA nodes to a series of simpler operations,
More information about the llvm-commits
mailing list